The data that I am using for this project can be accessed from https://www.kaggle.com/datasets/shashwatwork/municipal-waste-management-cost-prediction?resource=download
# Load the municipal waste management dataset (Kaggle CSV linked above).
# NOTE(review): hard-coded absolute path — will only work on this machine.
waste_data <- read.csv("~/Desktop/Machine Learning/public_data_waste_fee.csv")
# Inspect the structure: 4341 observations of 39 variables (output below)
str(waste_data)
## 'data.frame': 4341 obs. of 39 variables:
## $ region : chr "Emilia_Romagna" "Emilia_Romagna" "Emilia_Romagna" "Emilia_Romagna" ...
## $ province : chr "Ferrara" "Ferrara" "Ferrara" "Ferrara" ...
## $ name : chr "Comacchio" "Lagosanto" "Goro" "Mesola" ...
## $ tc : num 502 228 268 199 234 ...
## $ cres : num 129.3 49.5 50.6 41.1 58.3 ...
## $ csor : num 66.4 44.1 44.6 40.4 26 ...
## $ istat : int 38006 38011 38025 38014 110005 38010 38030 58120 27005 8050 ...
## $ area : num 283.8 34.4 26.6 84.3 35.7 ...
## $ pop : int 22648 4952 3895 7140 12193 3003 7364 67626 11793 2861 ...
## $ alt : int 1 1 1 1 1 1 1 1 1 2 ...
## $ isle : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sea : int 1 1 1 1 1 0 0 1 1 1 ...
## $ pden : num 79.8 143.8 146.3 84.7 341.5 ...
## $ wden : num 119671 70031 81117 43320 201565 ...
## $ urb : int 2 3 3 3 2 3 3 2 2 2 ...
## $ fee : chr "PAYT" "PAYT" "PAYT" "PAYT" ...
## $ d_fee : int 1 1 1 1 0 1 1 0 0 0 ...
## $ sample : int 1 1 1 1 0 1 1 0 1 0 ...
## $ organic : num NA 35.041 37.377 45.31 0.428 ...
## $ paper : num 4.36 9.89 11.99 9.76 6.6 ...
## $ glass : num 3.59 9.52 6.65 7.55 4.33 ...
## $ wood : num 2.27 4.00 1.32e-05 1.49e-01 2.30 ...
## $ metal : num 0.462 1.861 0.745 0.747 0.103 ...
## $ plastic : num 1.13 4.64 5.22 5.2 5.12 ...
## $ raee : num 0.346 1.609 NA NA 0.275 ...
## $ texile : num 0.112 0.351 0.449 0.4 0.287 ...
## $ other : num 3.2 9.02 16.04 9.77 4.05 ...
## $ msw_so : num 20396261 1831407 1694922 2881055 3026700 ...
## $ msw_un : int 13560520 580460 464400 770860 4169180 349620 556540 7895520 5659520 296480 ...
## $ msw : int 33956781 2411867 2159322 3651915 7195880 1682628 3336429 33435410 15175582 1553789 ...
## $ sor : num 60.1 75.9 78.5 78.9 42.1 ...
## $ geo : num 3 3 3 3 1 3 NA 2 3 3 ...
## $ roads : num 285 11 49 165 60 65 NA 329 77 17 ...
## $ s_wteregio: num 33.11 33.11 33.11 33.11 4.05 ...
## $ s_landfill: num 15.2 15.2 15.2 15.2 45.4 ...
## $ gdp : num 7.27 7.11 7.27 7.09 7.25 ...
## $ proads : num 4.35 6.08 4.34 3.71 5.27 ...
## $ wage : num 9.44 9.51 8.89 9.43 9.13 ...
## $ finance : num 7.49 7.32 7.49 7.3 7.46 ...
# List all 39 column names of the dataset
names(waste_data)
## [1] "region" "province" "name" "tc" "cres"
## [6] "csor" "istat" "area" "pop" "alt"
## [11] "isle" "sea" "pden" "wden" "urb"
## [16] "fee" "d_fee" "sample" "organic" "paper"
## [21] "glass" "wood" "metal" "plastic" "raee"
## [26] "texile" "other" "msw_so" "msw_un" "msw"
## [31] "sor" "geo" "roads" "s_wteregio" "s_landfill"
## [36] "gdp" "proads" "wage" "finance"
# Count missing values across the whole data frame (6372 NAs in total)
sum(is.na(waste_data))
## [1] 6372
# Tabulate observations per region (note: "piemonte" is lower-cased in the raw
# data, and Valle_d'Aosta contributes only a single observation)
summary(as.factor(waste_data$region))
## Abruzzo Basilicata Calabria
## 155 69 165
## Campania Emilia_Romagna Friuli_Venezia_Giulia
## 405 308 183
## Lazio Liguria Lombardia
## 188 179 1230
## Marche Molise piemonte
## 103 70 114
## Puglia Sardegna Sicilia
## 126 121 212
## Toscana Trentino_Alto_Adige Umbria
## 231 157 59
## Valle_d'Aosta Veneto
## 1 265
While this data comes in relatively clean, there are a few important issues that need to be addressed. Specifically, I remove the NA values and replace them with 0, and convert certain variable columns to factors where they are not already. Factors are used to represent categorical data in R. Each unique value in the column is treated as a level of the factor.
# Replace every NA in the dataset with 0.
# NOTE(review): blanket zero-imputation treats "missing" as "none"; for
# variables such as geo or roads this may not be the right assumption —
# column-specific imputation could be worth confirming.
waste_data[is.na(waste_data)] <- 0
# Re-summarise to confirm no NAs remain
summary(waste_data)
## region province name tc
## Length:4341 Length:4341 Length:4341 Min. : 25.69
## Class :character Class :character Class :character 1st Qu.:108.04
## Mode :character Mode :character Mode :character Median :136.62
## Mean :154.24
## 3rd Qu.:179.16
## Max. :977.42
## cres csor istat area
## Min. : 0.00 Min. : 0.00 Min. : 1272 Min. : 0.00
## 1st Qu.: 26.94 1st Qu.: 30.04 1st Qu.: 18135 1st Qu.: 10.81
## Median : 41.46 Median : 48.56 Median : 42015 Median : 22.71
## Mean : 53.53 Mean : 51.87 Mean : 47470 Mean : 40.94
## 3rd Qu.: 65.98 3rd Qu.: 66.03 3rd Qu.: 70049 3rd Qu.: 47.45
## Max. :670.32 Max. :582.16 Max. :111107 Max. :1287.39
## pop alt isle sea
## Min. : 34 Min. : 0.0 Min. :0.000000 Min. :0.0000
## 1st Qu.: 1579 1st Qu.: 79.0 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 3535 Median : 239.0 Median :0.000000 Median :0.0000
## Mean : 10204 Mean : 309.6 Mean :0.005068 Mean :0.1682
## 3rd Qu.: 8199 3rd Qu.: 459.0 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :2617175 Max. :1816.0 Max. :1.000000 Max. :1.0000
## pden wden urb fee
## Min. : 0.0 Min. : 0 Min. :0.000 Length:4341
## 1st Qu.: 62.0 1st Qu.: 23441 1st Qu.:2.000 Class :character
## Median : 150.8 Median : 68255 Median :3.000 Mode :character
## Mean : 404.5 Mean : 191792 Mean :2.487
## 3rd Qu.: 399.0 3rd Qu.: 194281 3rd Qu.:3.000
## Max. :12122.8 Max. :4978556 Max. :3.000
## d_fee sample organic paper
## Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 4.675 1st Qu.: 8.614
## Median :0.0000 Median :1.0000 Median :22.630 Median :10.856
## Mean :0.1283 Mean :0.5469 Mean :19.648 Mean :10.898
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:30.907 3rd Qu.:13.051
## Max. :1.0000 Max. :1.0000 Max. :61.639 Max. :45.288
## glass wood metal plastic
## Min. : 0.000 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 7.085 1st Qu.: 0.000 1st Qu.: 0.7456 1st Qu.: 4.069
## Median : 9.071 Median : 2.692 Median : 1.4697 Median : 5.762
## Mean : 9.335 Mean : 3.076 Mean : 1.6644 Mean : 6.057
## 3rd Qu.:11.257 3rd Qu.: 5.080 3rd Qu.: 2.2983 3rd Qu.: 7.506
## Max. :39.836 Max. :25.117 Max. :20.6715 Max. :31.605
## raee texile other msw_so
## Min. : 0.0000 Min. : 0.000000 Min. : 0.000 Min. : 0
## 1st Qu.: 0.6493 1st Qu.: 0.000053 1st Qu.: 3.606 1st Qu.: 373965
## Median : 1.1196 Median : 0.457738 Median : 6.887 Median : 1040737
## Mean : 1.1439 Mean : 0.580364 Mean : 7.693 Mean : 3248581
## 3rd Qu.: 1.5332 3rd Qu.: 0.854879 3rd Qu.:11.002 3rd Qu.: 2725645
## Max. :17.9536 Max. :10.584472 Max. :37.156 Max. :765130099
## msw_un msw sor geo
## Min. : 6185 Min. :1.997e+04 Min. : 0.25 Min. :0.00
## 1st Qu.: 175180 1st Qu.:6.117e+05 1st Qu.:57.83 1st Qu.:1.00
## Median : 409060 Median :1.524e+06 Median :70.84 Median :3.00
## Mean : 2042522 Mean :5.311e+06 Mean :66.24 Mean :2.14
## 3rd Qu.: 1056920 3rd Qu.:3.954e+06 3rd Qu.:79.09 3rd Qu.:3.00
## Max. :926757220 Max. :1.692e+09 Max. :97.48 Max. :3.00
## roads s_wteregio s_landfill gdp
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 19.00 1st Qu.: 5.634 1st Qu.: 4.551 1st Qu.: 6.725
## Median : 45.00 Median :18.540 Median :11.286 Median : 7.060
## Mean : 91.53 Mean :20.432 Mean :18.688 Mean : 6.588
## 3rd Qu.: 96.00 3rd Qu.:38.501 3rd Qu.:31.493 3rd Qu.: 7.511
## Max. :14970.00 Max. :65.122 Max. :92.532 Max. :10.539
## proads wage finance
## Min. :-3.367 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.076 1st Qu.: 9.143 1st Qu.: 6.926
## Median : 4.191 Median : 9.499 Median : 7.272
## Mean : 3.820 Mean : 8.827 Mean : 6.786
## 3rd Qu.: 4.997 3rd Qu.: 9.654 3rd Qu.: 7.736
## Max. : 8.980 Max. :10.485 Max. :10.855
# Tabulate observations per province (least frequent levels fold into "(Other)")
summary(as.factor(waste_data$province))
## Bergamo Brescia Milano
## 218 172 131
## Bolzano Salerno Como
## 115 112 106
## Cremona Udine Varese
## 102 102 99
## Vicenza Pavia Lecco
## 97 94 80
## Napoli Avellino Caserta
## 80 79 78
## Roma Chieti Cosenza
## 70 69 69
## Biella Mantova Sondrio
## 64 63 62
## Padova Savona Benevento
## 60 59 56
## Bologna Monza e della Brianza Potenza
## 54 54 54
## Imperia Pordenone Campobasso
## 52 50 49
## Catania Lodi Genova
## 49 49 48
## Lecce Novara Piacenza
## 47 46 46
## Palermo Pesaro e Urbino Frosinone
## 45 45 44
## Venezia Verona Messina
## 44 43 42
## Parma Reggio nell'Emilia Trento
## 42 42 42
## Firenze L'Aquila Modena
## 40 37 36
## Pisa Sud Sardegna Perugia
## 36 36 35
## Viterbo Sassari Pescara
## 35 34 31
## Reggio di Calabria Forli'-Cesena Nuoro
## 31 30 30
## Catanzaro Lucca Fermo
## 29 29 28
## Arezzo Grosseto Gorizia
## 27 26 25
## Vibo Valentia Bari Terni
## 25 24 24
## Foggia Agrigento Isernia
## 23 21 21
## Latina Belluno Ferrara
## 21 20 20
## La Spezia Pistoia Rimini
## 20 20 20
## Trapani Ravenna Rieti
## 19 18 18
## Siena Taranto Teramo
## 18 18 18
## Livorno Siracusa Ancona
## 17 17 16
## Cagliari Matera Ascoli Piceno
## 16 15 14
## Massa-Carrara Crotone Enna
## 12 11 8
## Ragusa Barletta-Andria-Trani Brindisi
## 8 7 7
## Prato Trieste Oristano
## 6 6 5
## Caltanissetta Asti Aosta
## 3 2 1
## (Other)
## 3
# Tabulate observations per municipality name — nearly all appear once,
# so this column behaves as an identifier rather than a predictor
summary(as.factor(waste_data$name))
## Castro Peglio
## 6 2 2
## Abano Terme Abbadia Lariana Abbateggio
## 1 1 1
## Abbiategrasso Abetone Cutigliano Acate
## 1 1 1
## Acerra Aci Bonaccorsi Aci Castello
## 1 1 1
## Acireale Acquafredda Acqualagna
## 1 1 1
## Acquanegra sul Chiese Acquasparta Acquaviva d'Isernia
## 1 1 1
## Acquaviva delle Fonti Acquedolci Acri
## 1 1 1
## Acuto Adelfia Adrano
## 1 1 1
## Adrara San Martino Adrara San Rocco Adro
## 1 1 1
## Affi Affile Agazzano
## 1 1 1
## Agerola Agliana Agna
## 1 1 1
## Agnadello Agnosine Agrate Brianza
## 1 1 1
## Agrate Conturbia Agrigento Agropoli
## 1 1 1
## Agugliano Aicurzio Aielli
## 1 1 1
## Aiello Calabro Aiello del Friuli Aiello del Sabato
## 1 1 1
## Ailano Ailoche Airola
## 1 1 1
## Airole Airuno Ala
## 1 1 1
## Alà dei Sardi Alanno Alano di Piave
## 1 1 1
## Alassio Albairate Albano Laziale
## 1 1 1
## Albano Sant'Alessandro Albaredo Arnaboldi Albaredo d'Adige
## 1 1 1
## Albaredo per San Marco Albareto Albenga
## 1 1 1
## Alberobello Albettone Albiate
## 1 1 1
## Albignasego Albinea Albino
## 1 1 1
## Albiolo Albisola Superiore Albissola Marina
## 1 1 1
## Albizzate Albosaggia Albuzzano
## 1 1 1
## Alcamo Aldino/Aldein Alessandria della Rocca
## 1 1 1
## Alessano Alezio Alfano
## 1 1 1
## Alfedena Alfonsine Alghero
## 1 1 1
## Algua Allerona Alliste
## 1 1 1
## Allumiere Almè Almenno San Bartolomeo
## 1 1 1
## Almenno San Salvatore Alonte Alseno
## 1 1 1
## Alta Val Tidone Alta Valle Intelvi Altare
## 1 1 1
## Altavilla Irpina Altavilla Silentina Altavilla Vicentina
## 1 1 1
## (Other)
## 4235
# Fee scheme distribution: PAYT (pay-as-you-throw) vs STANDARD
summary(as.factor(waste_data$fee))
## PAYT STANDARD
## 557 3784
# Load necessary libraries
library(ggplot2)
library(reshape2)

# Reshape the three cost columns ('tc', 'cres', 'csor') to long format so
# that ggplot can map each cost type onto the fill/colour aesthetics.
waste_long <- melt(waste_data, measure.vars = c("tc", "cres", "csor"),
                   variable.name = "Cost_Type", value.name = "Cost")

# Overlaid density plot comparing the distributions of the three cost types.
# Fix: the `size` aesthetic for lines was deprecated in ggplot2 3.4.0
# (see the warning emitted below) — `linewidth` is the replacement.
ggplot(waste_long, aes(x = Cost, fill = Cost_Type, color = Cost_Type)) +
  geom_density(alpha = 0.5, linewidth = 1) +
  labs(title = "Comparison of Total, Sorted, and Residual Costs",
       x = "Cost", y = "Density", fill = "Cost Type", color = "Cost Type") +
  theme_minimal() +
  scale_fill_manual(values = c("tc" = "blue", "cres" = "green", "csor" = "red")) +
  scale_color_manual(values = c("tc" = "blue", "cres" = "green", "csor" = "red"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Interpretations: cres = residual cost per capita; csor = sorted cost per capita.
Total Costs dominate in scale compared to the other two types, evidenced by its higher density for larger cost values. Residual Costs are generally lower, with a sharp peak close to zero and minimal spread beyond that. Sorted Costs have the least density overall, implying a minor contribution to the total cost framework. The plot highlights significant differences in the scale and distribution of the three cost types. Most costs are concentrated in the lower ranges for all three categories, but Total Costs have the most variability and presence at higher values. This may suggest that while the Total Costs include broad contributions, Residual and Sorted Costs are more specific and less impactful overall.
# Reference model: total cost regressed on raw (untransformed) population
model <- lm(tc ~ pop, data = waste_data)
# Scatterplot of tc against log-transformed population with an OLS fit.
# Fix: the x aesthetic is log(pop + 1), so the axis label now says so —
# labelling it plain "Population" misrepresented the scale.
library(ggplot2)
ggplot(waste_data, aes(x = log(pop + 1), y = tc)) +
  geom_point() +
  geom_smooth(method = "lm", col = "blue") +
  labs(title = "Relationship between TC and Population",
       x = "log(Population + 1)",
       y = "TC (Cost)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Interpretations: This scatterplot visualizes the relationship between TC (Cost) on the y-axis and Population on the x-axis, where there is little to no relationship expressed. The trend line is nearly horizontal, showing that changes in population do not systematically affect the cost (TC). This suggests little or no correlation between the two variables.
# Base-graphics scatter of total cost per capita against the geo code
plot(waste_data$geo,waste_data$tc, xlab="geo",
ylab="Total Cost per capita eur", main="Cost per capita eur v.s. Geo")
# Boxplot of total cost within each geo category for a clearer comparison
boxplot(waste_data$tc~waste_data$geo, xlab="geo",
ylab="Total Cost per capita eur", main="Cost per capita eur v.s. Geo")
Interpretations: There appears to be variability in the cost per capita across the different geographical categories (represented by geo values 0, 1, 1.5, 2, and 3). Some geo categories might consistently have lower median costs, while others have higher medians with larger spreads. The presence of many outliers across geo categories suggests a wide range of costs that may not fit typical trends, requiring further analysis.
# Quick base-graphics view of tc against the sea indicator (0/1)
plot(waste_data$sea, waste_data$tc)
# ggplot boxplot: treating sea as a factor draws one box per level
ggplot(waste_data, aes(x = factor(sea), y= tc)) +
geom_boxplot()
Interpretations: sea is a categorical variable with two levels (0 and 1). It represents whether a region is related to “sea” or not (e.g., coastal vs. non-coastal). tc represents the total cost per capita in euros. Total costs are generally higher in regions associated with sea = 1. This could imply that being near the sea is associated with higher costs, potentially due to factors like additional expenses related to coastal infrastructure and higher operational costs in sea-related regions (e.g., waste management, transport logistics).
# Histogram of total cost per capita — shows a pronounced right skew
hist(waste_data$tc, col='lightblue')
Interpretation: I recognize that there is a right skew; I can correct this with a log transformation.
# Log-transform total cost to correct the right skew.
# log1p(x) computes log(x + 1) directly and is numerically more accurate
# than log(x + 1) for values near zero.
waste_data$log_tc <- log1p(waste_data$tc)
# Histogram of the transformed variable — distribution is more symmetric
hist(waste_data$log_tc, col = 'lightblue')
Data Partitioning
# Reproducible 80/20 train-test split
set.seed(7)
total_obs <- nrow(waste_data)
# Data partition / sample splitting: draw 80% of the row indices for training.
# seq_len() is safe for any n (unlike 1:n), and floor() makes the truncation
# of the non-integer 0.8 * total_obs explicit; under set.seed(7) this
# produces exactly the same indices as the original sample(1:n, 0.8*n).
train_data_indices <- sample(seq_len(total_obs), floor(0.8 * total_obs))
train_data <- waste_data[train_data_indices, ]
test_data <- waste_data[-train_data_indices, ]
train_obs <- nrow(train_data)
Instead of building linear regression models on the log-scale total cost, I will build a linear regression model for the original scale of total cost, i.e. without the log transformation that corrects the right-skewness of cost.
# Confirm the training-set columns (log_tc is now the last, 40th, column)
colnames(train_data)
## [1] "region" "province" "name" "tc" "cres"
## [6] "csor" "istat" "area" "pop" "alt"
## [11] "isle" "sea" "pden" "wden" "urb"
## [16] "fee" "d_fee" "sample" "organic" "paper"
## [21] "glass" "wood" "metal" "plastic" "raee"
## [26] "texile" "other" "msw_so" "msw_un" "msw"
## [31] "sor" "geo" "roads" "s_wteregio" "s_landfill"
## [36] "gdp" "proads" "wage" "finance" "log_tc"
#lm_full <- lm(tc ~ ., data = train_data[, -c(1:3)])
# Full linear model: regress tc on all predictors, excluding the three ID
# columns (region, province, name) and the final log_tc column — keeping
# log_tc would leak the response into the predictors.
# NOTE(review): the commented-out alternative above is NOT equivalent:
# it drops only the ID columns and would retain log_tc.
lm_full <- lm(tc~., data=train_data[, 4:(ncol(train_data)- 1)])
print(summary(lm_full))
##
## Call:
## lm(formula = tc ~ ., data = train_data[, 4:(ncol(train_data) -
## 1)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -811.27 -24.39 -7.84 15.89 652.33
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.569e+01 8.613e+00 7.627 3.09e-14 ***
## cres 9.447e-01 2.167e-02 43.590 < 2e-16 ***
## csor 8.704e-01 2.588e-02 33.630 < 2e-16 ***
## istat -9.154e-05 3.628e-05 -2.523 0.011678 *
## area 1.202e-01 1.860e-02 6.461 1.19e-10 ***
## pop -1.999e-04 1.363e-04 -1.466 0.142650
## alt 3.556e-04 3.781e-03 0.094 0.925087
## isle 6.727e+01 1.090e+01 6.171 7.56e-10 ***
## sea 1.964e+01 2.581e+00 7.609 3.54e-14 ***
## pden -3.254e-02 5.668e-03 -5.741 1.02e-08 ***
## wden 8.092e-05 1.226e-05 6.599 4.78e-11 ***
## urb -8.449e-01 1.914e+00 -0.442 0.658840
## feeSTANDARD -3.519e-01 2.634e+00 -0.134 0.893736
## d_fee NA NA NA NA
## sample -2.106e+00 2.911e+00 -0.724 0.469364
## organic -3.420e-01 8.756e-02 -3.906 9.57e-05 ***
## paper -5.825e-01 2.572e-01 -2.265 0.023567 *
## glass -9.723e-01 2.531e-01 -3.842 0.000124 ***
## wood -7.476e-01 3.902e-01 -1.916 0.055445 .
## metal 7.028e-03 6.879e-01 0.010 0.991849
## plastic -2.156e-01 3.072e-01 -0.702 0.482912
## raee 2.461e+00 1.054e+00 2.334 0.019628 *
## texile -1.572e+00 1.178e+00 -1.335 0.181960
## other 1.822e-02 2.151e-01 0.085 0.932489
## msw_so 9.516e-07 4.592e-06 0.207 0.835865
## msw_un 6.319e-07 4.716e-06 0.134 0.893432
## msw -5.190e-07 4.676e-06 -0.111 0.911637
## sor 2.535e-01 1.075e-01 2.357 0.018487 *
## geo 1.773e+00 1.929e+00 0.919 0.358056
## roads -3.995e-03 2.910e-03 -1.373 0.169839
## s_wteregio -9.210e-02 6.967e-02 -1.322 0.186277
## s_landfill 2.868e-02 5.534e-02 0.518 0.604298
## gdp 4.743e+00 7.623e-01 6.223 5.48e-10 ***
## proads -3.403e+00 8.175e-01 -4.163 3.22e-05 ***
## wage -2.640e+00 8.190e-01 -3.223 0.001278 **
## finance NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.1 on 3438 degrees of freedom
## Multiple R-squared: 0.6323, Adjusted R-squared: 0.6287
## F-statistic: 179.1 on 33 and 3438 DF, p-value: < 2.2e-16
Backward Selection with BIC
# Backward stepwise selection; k = log(n) makes step() penalise by BIC
# rather than the default AIC (k = 2)
lm_bwd <- step(lm_full, direction='backward', k=log(train_obs))
## Start: AIC=26844.68
## tc ~ cres + csor + istat + area + pop + alt + isle + sea + pden +
## wden + urb + fee + d_fee + sample + organic + paper + glass +
## wood + metal + plastic + raee + texile + other + msw_so +
## msw_un + msw + sor + geo + roads + s_wteregio + s_landfill +
## gdp + proads + wage + finance
##
##
## Step: AIC=26844.68
## tc ~ cres + csor + istat + area + pop + alt + isle + sea + pden +
## wden + urb + fee + d_fee + sample + organic + paper + glass +
## wood + metal + plastic + raee + texile + other + msw_so +
## msw_un + msw + sor + geo + roads + s_wteregio + s_landfill +
## gdp + proads + wage
##
##
## Step: AIC=26844.68
## tc ~ cres + csor + istat + area + pop + alt + isle + sea + pden +
## wden + urb + fee + sample + organic + paper + glass + wood +
## metal + plastic + raee + texile + other + msw_so + msw_un +
## msw + sor + geo + roads + s_wteregio + s_landfill + gdp +
## proads + wage
##
## Df Sum of Sq RSS AIC
## - metal 1 0 7307517 26836
## - other 1 15 7307532 26836
## - alt 1 19 7307535 26836
## - msw 1 26 7307543 26836
## - fee 1 38 7307554 26836
## - msw_un 1 38 7307555 26836
## - msw_so 1 91 7307608 26837
## - urb 1 414 7307931 26837
## - s_landfill 1 571 7308087 26837
## - plastic 1 1047 7308563 26837
## - sample 1 1113 7308629 26837
## - geo 1 1796 7309312 26837
## - s_wteregio 1 3714 7311231 26838
## - texile 1 3788 7311305 26838
## - roads 1 4007 7311523 26838
## - pop 1 4570 7312087 26839
## - wood 1 7803 7315320 26840
## - paper 1 10906 7318422 26842
## - raee 1 11584 7319100 26842
## - sor 1 11807 7319323 26842
## - istat 1 13531 7321047 26843
## <none> 7307517 26845
## - wage 1 22086 7329602 26847
## - glass 1 31374 7338890 26851
## - organic 1 32426 7339942 26852
## - proads 1 36829 7344345 26854
## - pden 1 70050 7377566 26870
## - isle 1 80948 7388465 26875
## - gdp 1 82299 7389816 26875
## - area 1 88730 7396246 26878
## - wden 1 92558 7400075 26880
## - sea 1 123069 7430586 26894
## - csor 1 2403925 9711441 27824
## - cres 1 4038641 11346158 28364
##
## Step: AIC=26836.53
## tc ~ cres + csor + istat + area + pop + alt + isle + sea + pden +
## wden + urb + fee + sample + organic + paper + glass + wood +
## plastic + raee + texile + other + msw_so + msw_un + msw +
## sor + geo + roads + s_wteregio + s_landfill + gdp + proads +
## wage
##
## Df Sum of Sq RSS AIC
## - other 1 15 7307532 26828
## - alt 1 20 7307537 26828
## - msw 1 26 7307543 26828
## - fee 1 38 7307555 26828
## - msw_un 1 38 7307555 26828
## - msw_so 1 91 7307608 26828
## - urb 1 415 7307932 26829
## - s_landfill 1 572 7308089 26829
## - plastic 1 1048 7308565 26829
## - sample 1 1117 7308634 26829
## - geo 1 1796 7309313 26829
## - s_wteregio 1 3744 7311261 26830
## - texile 1 3791 7311308 26830
## - roads 1 4008 7311525 26830
## - pop 1 4570 7312087 26830
## - wood 1 7928 7315444 26832
## - paper 1 10938 7318454 26834
## - raee 1 11783 7319300 26834
## - sor 1 12043 7319560 26834
## - istat 1 13683 7321200 26835
## <none> 7307517 26836
## - wage 1 22167 7329684 26839
## - glass 1 31382 7338899 26843
## - organic 1 32666 7340183 26844
## - proads 1 36832 7344349 26846
## - pden 1 70224 7377741 26862
## - isle 1 81402 7388918 26867
## - gdp 1 82469 7389986 26867
## - area 1 88819 7396336 26870
## - wden 1 92797 7400314 26872
## - sea 1 123190 7430706 26886
## - csor 1 2405935 9713452 27816
## - cres 1 4038674 11346191 28356
##
## Step: AIC=26828.38
## tc ~ cres + csor + istat + area + pop + alt + isle + sea + pden +
## wden + urb + fee + sample + organic + paper + glass + wood +
## plastic + raee + texile + msw_so + msw_un + msw + sor + geo +
## roads + s_wteregio + s_landfill + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - alt 1 30 7307562 26820
## - msw 1 32 7307564 26820
## - fee 1 32 7307564 26820
## - msw_un 1 45 7307577 26820
## - msw_so 1 101 7307634 26820
## - urb 1 413 7307945 26820
## - s_landfill 1 560 7308092 26820
## - sample 1 1103 7308635 26821
## - plastic 1 1190 7308722 26821
## - geo 1 1845 7309377 26821
## - s_wteregio 1 3734 7311266 26822
## - texile 1 3856 7311388 26822
## - roads 1 4019 7311551 26822
## - pop 1 4593 7312125 26822
## - wood 1 7928 7315460 26824
## - paper 1 11693 7319225 26826
## - raee 1 11822 7319354 26826
## - istat 1 13688 7321220 26827
## - sor 1 16639 7324171 26828
## <none> 7307532 26828
## - wage 1 22333 7329865 26831
## - glass 1 33615 7341147 26836
## - organic 1 36786 7344318 26838
## - proads 1 36919 7344451 26838
## - pden 1 70557 7378089 26854
## - isle 1 81601 7389134 26859
## - gdp 1 82675 7390207 26859
## - area 1 90858 7398390 26863
## - wden 1 93446 7400978 26864
## - sea 1 123458 7430990 26878
## - csor 1 2414555 9722088 27812
## - cres 1 4043259 11350791 28349
##
## Step: AIC=26820.24
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## urb + fee + sample + organic + paper + glass + wood + plastic +
## raee + texile + msw_so + msw_un + msw + sor + geo + roads +
## s_wteregio + s_landfill + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - msw 1 33 7307595 26812
## - fee 1 38 7307600 26812
## - msw_un 1 47 7307609 26812
## - msw_so 1 104 7307666 26812
## - urb 1 396 7307958 26812
## - s_landfill 1 578 7308140 26812
## - sample 1 1136 7308698 26813
## - plastic 1 1238 7308800 26813
## - geo 1 1835 7309397 26813
## - s_wteregio 1 3707 7311269 26814
## - texile 1 3828 7311390 26814
## - roads 1 4070 7311632 26814
## - pop 1 4601 7312162 26814
## - wood 1 7946 7315508 26816
## - paper 1 11700 7319261 26818
## - raee 1 11887 7319449 26818
## - istat 1 13680 7321241 26819
## - sor 1 16691 7324253 26820
## <none> 7307562 26820
## - wage 1 22718 7330280 26823
## - glass 1 34625 7342187 26828
## - organic 1 36785 7344347 26830
## - proads 1 39377 7346939 26831
## - pden 1 70557 7378118 26846
## - isle 1 81676 7389238 26851
## - gdp 1 88746 7396308 26854
## - area 1 92525 7400087 26856
## - wden 1 93452 7401013 26856
## - sea 1 132690 7440252 26875
## - csor 1 2414628 9722190 27803
## - cres 1 4080862 11388424 28353
##
## Step: AIC=26812.11
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## urb + fee + sample + organic + paper + glass + wood + plastic +
## raee + texile + msw_so + msw_un + sor + geo + roads + s_wteregio +
## s_landfill + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - fee 1 36 7307631 26804
## - urb 1 385 7307980 26804
## - msw_un 1 417 7308012 26804
## - s_landfill 1 567 7308162 26804
## - sample 1 1153 7308748 26804
## - plastic 1 1249 7308844 26804
## - geo 1 1829 7309424 26805
## - s_wteregio 1 3679 7311274 26806
## - texile 1 3818 7311413 26806
## - roads 1 4096 7311691 26806
## - pop 1 4778 7312373 26806
## - msw_so 1 6838 7314433 26807
## - wood 1 7930 7315525 26808
## - paper 1 11686 7319281 26810
## - raee 1 11879 7319474 26810
## - istat 1 13716 7321311 26810
## - sor 1 16660 7324255 26812
## <none> 7307595 26812
## - wage 1 22717 7330312 26815
## - glass 1 34637 7342232 26820
## - organic 1 36831 7344426 26821
## - proads 1 39488 7347083 26823
## - pden 1 71957 7379551 26838
## - isle 1 81758 7389353 26843
## - gdp 1 88850 7396445 26846
## - area 1 92521 7400116 26848
## - wden 1 95580 7403175 26849
## - sea 1 132941 7440536 26866
## - csor 1 2414819 9722414 27795
## - cres 1 4080937 11388532 28344
##
## Step: AIC=26803.97
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## urb + sample + organic + paper + glass + wood + plastic +
## raee + texile + msw_so + msw_un + sor + geo + roads + s_wteregio +
## s_landfill + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - urb 1 372 7308003 26796
## - msw_un 1 410 7308041 26796
## - s_landfill 1 568 7308199 26796
## - sample 1 1117 7308748 26796
## - plastic 1 1258 7308889 26796
## - geo 1 1861 7309492 26797
## - s_wteregio 1 3781 7311412 26798
## - texile 1 3785 7311416 26798
## - roads 1 4092 7311723 26798
## - pop 1 4759 7312390 26798
## - msw_so 1 6820 7314452 26799
## - wood 1 8102 7315733 26800
## - paper 1 11651 7319282 26801
## - raee 1 11843 7319474 26801
## - istat 1 13899 7321530 26802
## - sor 1 17123 7324754 26804
## <none> 7307631 26804
## - wage 1 22759 7330390 26807
## - glass 1 34859 7342490 26812
## - organic 1 36922 7344553 26813
## - proads 1 39492 7347123 26814
## - pden 1 71939 7379570 26830
## - isle 1 81848 7389479 26834
## - gdp 1 88999 7396630 26838
## - area 1 92888 7400519 26840
## - wden 1 95556 7403187 26841
## - sea 1 133036 7440667 26858
## - csor 1 2417522 9725153 27788
## - cres 1 4086816 11394447 28338
##
## Step: AIC=26795.99
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## sample + organic + paper + glass + wood + plastic + raee +
## texile + msw_so + msw_un + sor + geo + roads + s_wteregio +
## s_landfill + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - msw_un 1 347 7308350 26788
## - s_landfill 1 547 7308550 26788
## - sample 1 1193 7309196 26788
## - plastic 1 1413 7309415 26788
## - geo 1 1787 7309790 26789
## - s_wteregio 1 3666 7311669 26790
## - texile 1 3850 7311853 26790
## - roads 1 3917 7311920 26790
## - pop 1 4596 7312599 26790
## - msw_so 1 6721 7314724 26791
## - wood 1 7986 7315989 26792
## - raee 1 11675 7319678 26793
## - paper 1 11728 7319731 26793
## - istat 1 14077 7322080 26794
## <none> 7308003 26796
## - sor 1 18713 7326716 26797
## - wage 1 22723 7330725 26799
## - glass 1 36070 7344072 26805
## - organic 1 41013 7349016 26807
## - proads 1 41210 7349212 26807
## - pden 1 71593 7379596 26822
## - isle 1 81920 7389922 26826
## - gdp 1 89624 7397627 26830
## - area 1 93770 7401773 26832
## - wden 1 96165 7404167 26833
## - sea 1 135032 7443035 26851
## - csor 1 2417452 9725455 27780
## - cres 1 4087611 11395614 28330
##
## Step: AIC=26788.01
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## sample + organic + paper + glass + wood + plastic + raee +
## texile + msw_so + sor + geo + roads + s_wteregio + s_landfill +
## gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - s_landfill 1 508 7308858 26780
## - sample 1 1121 7309470 26780
## - plastic 1 1366 7309715 26780
## - geo 1 1778 7310127 26781
## - s_wteregio 1 3576 7311926 26782
## - roads 1 3729 7312078 26782
## - texile 1 3826 7312175 26782
## - msw_so 1 7437 7315787 26783
## - wood 1 7817 7316167 26784
## - raee 1 11567 7319917 26785
## - paper 1 11694 7320043 26785
## - pop 1 13608 7321957 26786
## - istat 1 14003 7322352 26786
## <none> 7308350 26788
## - sor 1 18366 7326716 26789
## - wage 1 22672 7331022 26791
## - glass 1 35889 7344239 26797
## - organic 1 41121 7349470 26799
## - proads 1 41174 7349523 26799
## - pden 1 74436 7382785 26815
## - isle 1 82034 7390383 26819
## - gdp 1 89496 7397845 26822
## - area 1 93605 7401954 26824
## - wden 1 97597 7405947 26826
## - sea 1 136023 7444372 26844
## - csor 1 2421691 9730041 27774
## - cres 1 4123690 11432039 28333
##
## Step: AIC=26780.1
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## sample + organic + paper + glass + wood + plastic + raee +
## texile + msw_so + sor + geo + roads + s_wteregio + gdp +
## proads + wage
##
## Df Sum of Sq RSS AIC
## - sample 1 1261 7310118 26772
## - geo 1 1344 7310202 26773
## - plastic 1 1548 7310406 26773
## - roads 1 3561 7312419 26774
## - texile 1 4070 7312928 26774
## - s_wteregio 1 4079 7312937 26774
## - msw_so 1 7397 7316255 26776
## - wood 1 7560 7316418 26776
## - paper 1 11200 7320058 26777
## - raee 1 11463 7320321 26777
## - istat 1 13535 7322393 26778
## - pop 1 13636 7322494 26778
## <none> 7308858 26780
## - sor 1 18019 7326877 26780
## - wage 1 23661 7332519 26783
## - glass 1 36608 7345466 26789
## - organic 1 40726 7349584 26791
## - proads 1 42037 7350895 26792
## - pden 1 75369 7384227 26808
## - isle 1 81918 7390776 26811
## - gdp 1 90224 7399081 26814
## - area 1 94699 7403557 26817
## - wden 1 98023 7406881 26818
## - sea 1 137812 7446670 26837
## - csor 1 2431722 9740579 27769
## - cres 1 4128949 11437807 28327
##
## Step: AIC=26772.54
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + paper + glass + wood + plastic + raee + texile +
## msw_so + sor + geo + roads + s_wteregio + gdp + proads +
## wage
##
## Df Sum of Sq RSS AIC
## - geo 1 492 7310610 26765
## - plastic 1 1090 7311209 26765
## - roads 1 3507 7313625 26766
## - texile 1 3875 7313993 26766
## - s_wteregio 1 6103 7316221 26767
## - msw_so 1 7173 7317291 26768
## - wood 1 7902 7318020 26768
## - raee 1 11401 7321520 26770
## - paper 1 11455 7321573 26770
## - istat 1 12882 7323001 26770
## - pop 1 13210 7323329 26771
## <none> 7310118 26772
## - sor 1 17190 7327308 26772
## - wage 1 22436 7332554 26775
## - glass 1 35350 7345468 26781
## - organic 1 40250 7350368 26784
## - proads 1 42461 7352580 26784
## - pden 1 74724 7384843 26800
## - isle 1 81011 7391129 26803
## - gdp 1 91593 7401711 26808
## - area 1 93448 7403566 26808
## - wden 1 97435 7407553 26810
## - sea 1 142450 7452568 26831
## - csor 1 2452332 9762450 27769
## - cres 1 4152512 11462631 28326
##
## Step: AIC=26764.62
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + paper + glass + wood + plastic + raee + texile +
## msw_so + sor + roads + s_wteregio + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - plastic 1 1338 7311948 26757
## - roads 1 3628 7314238 26758
## - texile 1 4192 7314802 26758
## - s_wteregio 1 5655 7316265 26759
## - msw_so 1 7381 7317992 26760
## - wood 1 7480 7318090 26760
## - paper 1 11139 7321749 26762
## - raee 1 11333 7321944 26762
## - pop 1 13393 7324003 26763
## <none> 7310610 26765
## - sor 1 17295 7327905 26765
## - istat 1 21914 7332524 26767
## - wage 1 25679 7336289 26769
## - glass 1 35049 7345659 26773
## - organic 1 40277 7350887 26776
## - proads 1 42174 7352784 26776
## - pden 1 76012 7386622 26792
## - isle 1 80568 7391178 26794
## - gdp 1 91357 7401967 26800
## - area 1 93014 7403624 26800
## - wden 1 98273 7408883 26803
## - sea 1 142278 7452888 26823
## - csor 1 2457340 9767950 27763
## - cres 1 4152157 11462767 28318
##
## Step: AIC=26757.11
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + paper + glass + wood + raee + texile + msw_so +
## sor + roads + s_wteregio + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - roads 1 3601 7315549 26751
## - texile 1 4195 7316143 26751
## - s_wteregio 1 5378 7317326 26752
## - wood 1 6677 7318626 26752
## - msw_so 1 7425 7319374 26752
## - paper 1 10681 7322629 26754
## - raee 1 11190 7323138 26754
## - pop 1 13575 7325523 26755
## - sor 1 16175 7328123 26757
## <none> 7311948 26757
## - istat 1 22598 7334546 26760
## - wage 1 25400 7337348 26761
## - glass 1 34183 7346131 26765
## - organic 1 39156 7351104 26768
## - proads 1 40940 7352889 26768
## - pden 1 77708 7389656 26786
## - isle 1 80617 7392565 26787
## - gdp 1 90063 7402012 26792
## - area 1 96187 7408136 26794
## - wden 1 100786 7412734 26796
## - sea 1 141683 7453631 26816
## - csor 1 2456252 9768200 27754
## - cres 1 4161056 11473005 28313
##
## Step: AIC=26750.66
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + paper + glass + wood + raee + texile + msw_so +
## sor + s_wteregio + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - texile 1 4121 7319670 26744
## - s_wteregio 1 5673 7321222 26745
## - msw_so 1 6755 7322304 26746
## - wood 1 6850 7322399 26746
## - paper 1 10621 7326170 26748
## - raee 1 11424 7326973 26748
## - pop 1 14088 7329637 26749
## - sor 1 15695 7331244 26750
## <none> 7315549 26751
## - istat 1 23407 7338956 26754
## - wage 1 25530 7341080 26755
## - glass 1 33960 7349509 26759
## - organic 1 38018 7353567 26760
## - proads 1 38144 7353693 26761
## - pden 1 78073 7393622 26779
## - isle 1 80683 7396232 26781
## - gdp 1 86951 7402500 26784
## - area 1 92780 7408329 26786
## - wden 1 101202 7416751 26790
## - sea 1 140419 7455968 26808
## - csor 1 2460459 9776008 27749
## - cres 1 4177403 11492952 28311
##
## Step: AIC=26744.47
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + paper + glass + wood + raee + msw_so + sor + s_wteregio +
## gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - s_wteregio 1 5149 7324819 26739
## - wood 1 6483 7326153 26739
## - msw_so 1 6936 7326606 26740
## - paper 1 10838 7330508 26742
## - raee 1 11188 7330858 26742
## - sor 1 13962 7333632 26743
## - pop 1 14314 7333984 26743
## <none> 7319670 26744
## - istat 1 23416 7343086 26747
## - wage 1 24886 7344556 26748
## - glass 1 32842 7352512 26752
## - organic 1 37538 7357208 26754
## - proads 1 38448 7358118 26754
## - pden 1 79355 7399025 26774
## - isle 1 80480 7400150 26774
## - gdp 1 85353 7405023 26777
## - area 1 92527 7412197 26780
## - wden 1 102441 7422111 26785
## - sea 1 143469 7463139 26804
## - csor 1 2471975 9791645 27746
## - cres 1 4195992 11515662 28310
##
## Step: AIC=26738.75
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + paper + glass + wood + raee + msw_so + sor + gdp +
## proads + wage
##
## Df Sum of Sq RSS AIC
## - msw_so 1 6372 7331191 26734
## - paper 1 7989 7332808 26734
## - wood 1 8498 7333317 26735
## - raee 1 12255 7337074 26736
## - sor 1 12964 7337783 26737
## - pop 1 13635 7338454 26737
## <none> 7324819 26739
## - istat 1 21255 7346074 26741
## - wage 1 31211 7356029 26745
## - glass 1 34555 7359374 26747
## - organic 1 36785 7361603 26748
## - proads 1 40150 7364969 26750
## - pden 1 78912 7403731 26768
## - isle 1 79126 7403945 26768
## - gdp 1 87513 7412332 26772
## - area 1 100428 7425247 26778
## - wden 1 100856 7425675 26778
## - sea 1 172129 7496948 26811
## - csor 1 2516217 9841036 27756
## - cres 1 4234724 11559543 28315
##
## Step: AIC=26733.62
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + paper + glass + wood + raee + sor + gdp + proads +
## wage
##
## Df Sum of Sq RSS AIC
## - paper 1 7905 7339096 26729
## - wood 1 8333 7339524 26729
## - raee 1 11312 7342503 26731
## - sor 1 16976 7348168 26734
## <none> 7331191 26734
## - pop 1 19265 7350456 26735
## - istat 1 21722 7352913 26736
## - wage 1 31249 7362440 26740
## - glass 1 34926 7366117 26742
## - proads 1 39714 7370905 26744
## - organic 1 42319 7373510 26746
## - isle 1 79049 7410240 26763
## - gdp 1 87578 7418769 26767
## - pden 1 90444 7421635 26768
## - area 1 105528 7436719 26775
## - wden 1 112908 7444100 26778
## - sea 1 169201 7500392 26805
## - csor 1 2521263 9852455 27752
## - cres 1 4247656 11578847 28312
##
## Step: AIC=26729.21
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + glass + wood + raee + sor + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - wood 1 7232 7346328 26724
## - sor 1 10230 7349326 26726
## - raee 1 11736 7350833 26727
## <none> 7339096 26729
## - pop 1 20051 7359148 26730
## - istat 1 21515 7360611 26731
## - wage 1 33012 7372108 26737
## - glass 1 33231 7372327 26737
## - organic 1 36138 7375234 26738
## - proads 1 39697 7378793 26740
## - isle 1 79633 7418729 26758
## - pden 1 88531 7427627 26763
## - gdp 1 90272 7429368 26764
## - area 1 104054 7443150 26770
## - wden 1 111658 7450754 26774
## - sea 1 169403 7508499 26800
## - csor 1 2563792 9902888 27761
## - cres 1 4273723 11612819 28314
##
## Step: AIC=26724.48
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + glass + raee + sor + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - sor 1 6274 7352602 26719
## - raee 1 9353 7355681 26721
## - istat 1 16185 7362513 26724
## <none> 7346328 26724
## - pop 1 19244 7365572 26725
## - glass 1 27833 7374161 26730
## - organic 1 31826 7378154 26731
## - wage 1 36644 7382972 26734
## - proads 1 43714 7390042 26737
## - isle 1 79369 7425697 26754
## - pden 1 84033 7430361 26756
## - gdp 1 98082 7444410 26762
## - wden 1 106728 7453056 26766
## - area 1 106926 7453254 26766
## - sea 1 179681 7526009 26800
## - csor 1 2573312 9919640 27759
## - cres 1 4285415 11631744 28312
##
## Step: AIC=26719.29
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + glass + raee + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## - raee 1 13700 7366302 26718
## <none> 7352602 26719
## - istat 1 19298 7371900 26720
## - pop 1 21345 7373947 26721
## - glass 1 22834 7375436 26722
## - organic 1 25762 7378364 26723
## - wage 1 33608 7386210 26727
## - proads 1 38026 7390628 26729
## - isle 1 79252 7431854 26748
## - pden 1 89746 7442348 26753
## - gdp 1 91956 7444558 26754
## - area 1 114020 7466622 26765
## - wden 1 115424 7468026 26765
## - sea 1 178297 7530899 26794
## - csor 1 2728190 10080792 27807
## - cres 1 4622599 11975201 28405
##
## Step: AIC=26717.6
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden +
## organic + glass + gdp + proads + wage
##
## Df Sum of Sq RSS AIC
## <none> 7366302 26718
## - glass 1 20830 7387132 26719
## - pop 1 22076 7388378 26720
## - organic 1 23585 7389887 26720
## - istat 1 25238 7391540 26721
## - wage 1 29337 7395639 26723
## - proads 1 39790 7406092 26728
## - isle 1 86137 7452439 26750
## - gdp 1 86385 7452688 26750
## - pden 1 88293 7454596 26751
## - wden 1 113035 7479338 26762
## - area 1 119260 7485562 26765
## - sea 1 175306 7541608 26791
## - csor 1 2741206 10107508 27808
## - cres 1 4613246 11979548 28398
# Coefficient summary of the backward-selected linear model
summary(lm_bwd)
##
## Call:
## lm(formula = tc ~ cres + csor + istat + area + pop + isle + sea +
## pden + wden + organic + glass + gdp + proads + wage, data = train_data[,
## 4:(ncol(train_data) - 1)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -822.04 -24.76 -7.73 15.76 656.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.473e+01 4.847e+00 13.354 < 2e-16 ***
## cres 9.391e-01 2.018e-02 46.529 < 2e-16 ***
## csor 8.918e-01 2.486e-02 35.867 < 2e-16 ***
## istat -9.586e-05 2.785e-05 -3.442 0.000585 ***
## area 1.293e-01 1.728e-02 7.481 9.29e-14 ***
## pop -5.482e-05 1.703e-05 -3.219 0.001300 **
## isle 6.847e+01 1.077e+01 6.358 2.31e-10 ***
## sea 2.097e+01 2.312e+00 9.070 < 2e-16 ***
## pden -3.413e-02 5.303e-03 -6.437 1.39e-10 ***
## wden 8.411e-05 1.155e-05 7.283 4.01e-13 ***
## organic -2.103e-01 6.322e-02 -3.327 0.000887 ***
## glass -6.794e-01 2.173e-01 -3.127 0.001783 **
## gdp 4.356e+00 6.841e-01 6.367 2.18e-10 ***
## proads -3.048e+00 7.054e-01 -4.321 1.60e-05 ***
## wage -2.261e+00 6.092e-01 -3.711 0.000210 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.16 on 3457 degrees of freedom
## Multiple R-squared: 0.6293, Adjusted R-squared: 0.6278
## F-statistic: 419.2 on 14 and 3457 DF, p-value: < 2.2e-16
# Predictors in the full model (all 35 terms plus intercept)
names(coef(lm_full))
## [1] "(Intercept)" "cres" "csor" "istat" "area"
## [6] "pop" "alt" "isle" "sea" "pden"
## [11] "wden" "urb" "feeSTANDARD" "d_fee" "sample"
## [16] "organic" "paper" "glass" "wood" "metal"
## [21] "plastic" "raee" "texile" "other" "msw_so"
## [26] "msw_un" "msw" "sor" "geo" "roads"
## [31] "s_wteregio" "s_landfill" "gdp" "proads" "wage"
## [36] "finance"
# Predictors retained after backward stepwise selection (14 terms plus intercept)
names(coef(lm_bwd))
## [1] "(Intercept)" "cres" "csor" "istat" "area"
## [6] "pop" "isle" "sea" "pden" "wden"
## [11] "organic" "glass" "gdp" "proads" "wage"
# Generate predictions on the held-out test set from both linear models
lm_full_pred <- predict(lm_full, newdata=test_data)
lm_bwd_pred <- predict(lm_bwd, newdata=test_data)
# forecast::accuracy() reports ME, RMSE, MAE, MPE, MAPE against the observed values
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Test-set accuracy of the full linear model
print(accuracy(lm_full_pred, test_data$tc))
## ME RMSE MAE MPE MAPE
## Test set -0.03499095 41.92483 27.7932 -6.059496 18.49293
Visual comparison of predicted vs. actual values
# Pair the backward-selection predictions with the observed total costs
lm_plot_df <- data.frame(
  predicted = lm_bwd_pred,
  actual    = test_data$tc
)
# Scatter of actual vs. predicted with a loess trend; the dashed 45-degree
# line marks perfect prediction
ggplot(lm_plot_df, aes(x = predicted, y = actual)) +
  geom_point() +
  geom_smooth() +
  xlim(-100, 850) +
  ylim(-100, 850) +
  geom_abline(slope = 1, linetype = 2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Test-set accuracy of the backward-selected model (nearly identical to the full model)
print(accuracy(lm_bwd_pred, test_data$tc))
## ME RMSE MAE MPE MAPE
## Test set -0.1313893 41.92336 27.80371 -6.111286 18.4673
#install.packages("ggplot2")
#install.packages("rpart") # Popular decision tree algorithm
#install.packages("rattle") # Fancy tree plot
#install.packages("rpart.plot") # Enhanced tree plots
#install.packages("RColorBrewer") # Color selection for fancy tree plot
#install.packages("party") # Alternative decision tree algorithm
#install.packages("partykit") # Convert rpart object to BinaryTree
#install.packages("caret")
# install.packages("splitstackshape")
library(ggplot2)
library(rpart) # Popular decision tree algorithm
library(rattle) # Fancy tree plot
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot) # Enhanced tree plots
library(RColorBrewer) # Color selection for fancy tree plot
library(party) # Alternative decision tree algorithm
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(partykit) # Convert rpart object to BinaryTree
## Loading required package: libcoin
##
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
## node_terminal, varimp
library(caret)
## Loading required package: lattice
library(reshape2) # Load reshape 2 for melting
library(DMwR) # Load data mining with R for SMOTE
library(splitstackshape) # Used for stratified sampling
# Build the decision tree model
# method = "anova" fits a regression tree for the continuous target tc;
# columns 1-3 (presumably the region/province/name identifiers) and the
# final column are excluded from the feature set — TODO confirm column layout
tree_model <- rpart(tc ~ ., data = train_data[, 4:(ncol(train_data) - 1)], method = "anova") # 'anova' for continuous target
# Visualize the tree
rpart.plot(tree_model, type = 3, digits = 3, fallen.leaves = TRUE, cex = 0.7)
Interpretation
Key components of the tree:
Nodes: each internal node (rectangular split) represents a decision based on a specific feature (e.g., cres, csor, pop); the leaf nodes (rounded, at the bottom) represent the final predictions.
Splits: each split is a condition on a feature. For example, cres < 124 is the first split, dividing the data based on whether this condition is true or false. These splits isolate subsets of data with similar characteristics.
Leaf values: the numbers in each leaf give the predicted value and the share of the data reaching that leaf. For example, 103 (20.1%) means the leaf predicts a value of 103 and contains 20.1% of the observations.
The feature at the top (cres) is the most important for splitting the data, as it is used in the root node. Other features, such as csor and pop, are less influential but still help refine the predictions.
fancyRpartPlot(tree_model) # Plot fancy tree
# Print the fitted tree's node-by-node structure (n, deviance, predicted value)
tree_model
## n= 3472
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3472 19871340.0 154.1717
## 2) cres< 124.31 3255 10130030.0 144.2435
## 4) csor< 91.035 2975 6885656.0 137.0712
## 8) cres< 41.905 1640 2431924.0 119.5457
## 16) csor< 44.925 697 756329.6 103.2627 *
## 17) csor>=44.925 943 1354202.0 131.5810 *
## 9) cres>=41.905 1335 3331219.0 158.6007
## 18) csor< 55.985 931 1992266.0 147.3217 *
## 19) csor>=55.985 404 947579.5 184.5926
## 38) cres< 82.83 331 494683.0 173.0153 *
## 39) cres>=82.83 73 207365.9 237.0873 *
## 5) csor>=91.035 280 1465313.0 220.4486
## 10) cres< 69.665 217 480695.8 195.6440 *
## 11) cres>=69.665 63 391227.5 305.8865 *
## 3) cres>=124.31 217 4607846.0 303.0945
## 6) csor< 141.24 199 3051299.0 281.5866
## 12) cres< 178.465 141 1036042.0 243.6839
## 24) csor< 38.045 91 242300.8 211.2895 *
## 25) csor>=38.045 50 524444.3 302.6418 *
## 13) cres>=178.465 58 1320256.0 373.7295
## 26) pop>=305 49 463768.8 338.7927 *
## 27) pop< 305 9 471054.1 563.9411 *
## 7) csor>=141.24 18 446775.2 540.8756 *
summary(tree_model)
## Call:
## rpart(formula = tc ~ ., data = train_data[, 4:(ncol(train_data) -
## 1)], method = "anova")
## n= 3472
##
## CP nsplit rel error xerror xstd
## 1 0.25833528 0 1.0000000 1.0009003 0.07612054
## 2 0.08952885 1 0.7416647 0.7620112 0.04984418
## 3 0.05648907 2 0.6521359 0.6836191 0.04732935
## 4 0.05584786 3 0.5956468 0.6469274 0.04461505
## 5 0.03497502 4 0.5397989 0.6001122 0.04385275
## 6 0.02986160 5 0.5048239 0.6014537 0.04607272
## 7 0.01969533 6 0.4749623 0.5626851 0.04473741
## 8 0.01939641 7 0.4552670 0.5174028 0.03772581
## 9 0.01617368 8 0.4358706 0.5056735 0.03762118
## 10 0.01355203 9 0.4196969 0.4979615 0.03751073
## 11 0.01235602 10 0.4061449 0.4860922 0.03653164
## 12 0.01000000 11 0.3937889 0.4770283 0.03539802
##
## Variable importance
## cres csor sor pop finance gdp metal msw_un organic msw
## 49 25 4 3 2 2 2 2 2 2
## paper geo pden alt
## 2 2 1 1
##
## Node number 1: 3472 observations, complexity param=0.2583353
## mean=154.1717, MSE=5723.313
## left son=2 (3255 obs) right son=3 (217 obs)
## Primary splits:
## cres < 124.31 to the left, improve=0.25833530, (0 missing)
## csor < 112.795 to the left, improve=0.15195620, (0 missing)
## sea < 0.5 to the left, improve=0.12044290, (0 missing)
## gdp < 7.325502 to the left, improve=0.09884944, (0 missing)
## finance < 7.545267 to the left, improve=0.09884944, (0 missing)
## Surrogate splits:
## csor < 205 to the left, agree=0.939, adj=0.023, (0 split)
## msw_so < 8552.5 to the right, agree=0.938, adj=0.009, (0 split)
## pop < 72.5 to the right, agree=0.938, adj=0.005, (0 split)
## sor < 0.71 to the right, agree=0.938, adj=0.005, (0 split)
##
## Node number 2: 3255 observations, complexity param=0.08952885
## mean=144.2435, MSE=3112.144
## left son=4 (2975 obs) right son=5 (280 obs)
## Primary splits:
## csor < 91.035 to the left, improve=0.17562230, (0 missing)
## cres < 48.795 to the left, improve=0.16756350, (0 missing)
## sea < 0.5 to the left, improve=0.11856350, (0 missing)
## s_landfill < 13.27579 to the left, improve=0.09471711, (0 missing)
## geo < 2.5 to the right, improve=0.09379906, (0 missing)
## Surrogate splits:
## istat < 111099.5 to the left, agree=0.915, adj=0.007, (0 split)
## paper < 40.16635 to the left, agree=0.915, adj=0.007, (0 split)
## gdp < 9.076158 to the left, agree=0.914, adj=0.004, (0 split)
## finance < 9.348442 to the left, agree=0.914, adj=0.004, (0 split)
##
## Node number 3: 217 observations, complexity param=0.05584786
## mean=303.0945, MSE=21234.31
## left son=6 (199 obs) right son=7 (18 obs)
## Primary splits:
## csor < 141.24 to the left, improve=0.2408440, (0 missing)
## cres < 172.045 to the left, improve=0.2118968, (0 missing)
## gdp < 8.290743 to the left, improve=0.1875395, (0 missing)
## finance < 8.539466 to the left, improve=0.1875395, (0 missing)
## alt < 7.5 to the right, improve=0.1209677, (0 missing)
##
## Node number 4: 2975 observations, complexity param=0.05648907
## mean=137.0712, MSE=2314.506
## left son=8 (1640 obs) right son=9 (1335 obs)
## Primary splits:
## cres < 41.905 to the left, improve=0.16302200, (0 missing)
## geo < 2.5 to the right, improve=0.11420950, (0 missing)
## istat < 31024.5 to the left, improve=0.10912740, (0 missing)
## s_landfill < 13.27579 to the left, improve=0.10504870, (0 missing)
## msw_un < 1533840 to the left, improve=0.09428845, (0 missing)
## Surrogate splits:
## sor < 66.735 to the right, agree=0.748, adj=0.439, (0 split)
## metal < 1.235346 to the right, agree=0.665, adj=0.254, (0 split)
## organic < 20.68348 to the right, agree=0.657, adj=0.236, (0 split)
## geo < 2.5 to the right, agree=0.651, adj=0.223, (0 split)
## paper < 9.066684 to the right, agree=0.648, adj=0.216, (0 split)
##
## Node number 5: 280 observations, complexity param=0.0298616
## mean=220.4486, MSE=5233.262
## left son=10 (217 obs) right son=11 (63 obs)
## Primary splits:
## cres < 69.665 to the left, improve=0.40495780, (0 missing)
## csor < 145.89 to the left, improve=0.15949150, (0 missing)
## sea < 0.5 to the left, improve=0.13084540, (0 missing)
## alt < 15.5 to the right, improve=0.10410440, (0 missing)
## istat < 22520 to the right, improve=0.08625579, (0 missing)
## Surrogate splits:
## alt < 4.5 to the right, agree=0.789, adj=0.063, (0 split)
## pden < 11.1302 to the right, agree=0.789, adj=0.063, (0 split)
## sor < 42.125 to the right, agree=0.789, adj=0.063, (0 split)
## csor < 160.715 to the left, agree=0.786, adj=0.048, (0 split)
## area < 2.715 to the right, agree=0.786, adj=0.048, (0 split)
##
## Node number 6: 199 observations, complexity param=0.03497502
## mean=281.5866, MSE=15333.16
## left son=12 (141 obs) right son=13 (58 obs)
## Primary splits:
## cres < 178.465 to the left, improve=0.2277721, (0 missing)
## gdp < 8.290991 to the left, improve=0.2274784, (0 missing)
## finance < 8.539721 to the left, improve=0.2274784, (0 missing)
## csor < 37.905 to the left, improve=0.1698224, (0 missing)
## pop < 220.5 to the right, improve=0.1060024, (0 missing)
## Surrogate splits:
## sor < 23.885 to the right, agree=0.734, adj=0.086, (0 split)
## gdp < 8.502975 to the left, agree=0.734, adj=0.086, (0 split)
## finance < 8.758064 to the left, agree=0.734, adj=0.086, (0 split)
## isle < 0.5 to the left, agree=0.729, adj=0.069, (0 split)
## glass < 1.93376 to the right, agree=0.729, adj=0.069, (0 split)
##
## Node number 7: 18 observations
## mean=540.8756, MSE=24820.84
##
## Node number 8: 1640 observations, complexity param=0.01617368
## mean=119.5457, MSE=1482.881
## left son=16 (697 obs) right son=17 (943 obs)
## Primary splits:
## csor < 44.925 to the left, improve=0.13215570, (0 missing)
## s_landfill < 13.27579 to the left, improve=0.12435610, (0 missing)
## istat < 31514 to the left, improve=0.08575976, (0 missing)
## geo < 2.5 to the right, improve=0.08087859, (0 missing)
## msw_un < 802895 to the left, improve=0.06796906, (0 missing)
## Surrogate splits:
## plastic < 3.762071 to the left, agree=0.618, adj=0.102, (0 split)
## sor < 60.725 to the left, agree=0.599, adj=0.057, (0 split)
## istat < 24564 to the left, agree=0.599, adj=0.056, (0 split)
## alt < 616.5 to the right, agree=0.599, adj=0.056, (0 split)
## metal < 3.097131 to the right, agree=0.596, adj=0.050, (0 split)
##
## Node number 9: 1335 observations, complexity param=0.01969533
## mean=158.6007, MSE=2495.295
## left son=18 (931 obs) right son=19 (404 obs)
## Primary splits:
## csor < 55.985 to the left, improve=0.11748630, (0 missing)
## cres < 65.84 to the left, improve=0.08717904, (0 missing)
## sea < 0.5 to the left, improve=0.08667848, (0 missing)
## msw_un < 1536435 to the left, improve=0.08513964, (0 missing)
## area < 80.545 to the left, improve=0.06546742, (0 missing)
## Surrogate splits:
## wood < 8.147758 to the left, agree=0.706, adj=0.027, (0 split)
## msw_so < 25241050 to the left, agree=0.705, adj=0.025, (0 split)
## msw_un < 4181055 to the left, agree=0.704, adj=0.022, (0 split)
## msw < 17460710 to the left, agree=0.703, adj=0.020, (0 split)
## pop < 41309.5 to the left, agree=0.703, adj=0.017, (0 split)
##
## Node number 10: 217 observations
## mean=195.644, MSE=2215.188
##
## Node number 11: 63 observations
## mean=305.8865, MSE=6209.96
##
## Node number 12: 141 observations, complexity param=0.01355203
## mean=243.6839, MSE=7347.817
## left son=24 (91 obs) right son=25 (50 obs)
## Primary splits:
## csor < 38.045 to the left, improve=0.25992870, (0 missing)
## alt < 43 to the right, improve=0.17914240, (0 missing)
## gdp < 7.857229 to the left, improve=0.11904880, (0 missing)
## finance < 8.092946 to the left, improve=0.11904880, (0 missing)
## wage < 9.405493 to the left, improve=0.09072908, (0 missing)
## Surrogate splits:
## alt < 15.5 to the right, agree=0.716, adj=0.20, (0 split)
## wage < 9.517115 to the left, agree=0.709, adj=0.18, (0 split)
## istat < 16223 to the right, agree=0.681, adj=0.10, (0 split)
## glass < 11.00022 to the left, agree=0.674, adj=0.08, (0 split)
## plastic < 10.80984 to the left, agree=0.674, adj=0.08, (0 split)
##
## Node number 13: 58 observations, complexity param=0.01939641
## mean=373.7295, MSE=22763.03
## left son=26 (49 obs) right son=27 (9 obs)
## Primary splits:
## pop < 305 to the right, improve=0.2919379, (0 missing)
## gdp < 8.306887 to the left, improve=0.2779149, (0 missing)
## finance < 8.556094 to the left, improve=0.2779149, (0 missing)
## msw_un < 119585 to the right, improve=0.2313858, (0 missing)
## pden < 13.36629 to the right, improve=0.1955234, (0 missing)
## Surrogate splits:
## msw_un < 133180 to the right, agree=0.948, adj=0.667, (0 split)
## msw < 195999.5 to the right, agree=0.948, adj=0.667, (0 split)
## gdp < 8.614421 to the left, agree=0.948, adj=0.667, (0 split)
## finance < 8.872854 to the left, agree=0.948, adj=0.667, (0 split)
## pden < 10.27217 to the right, agree=0.914, adj=0.444, (0 split)
##
## Node number 16: 697 observations
## mean=103.2627, MSE=1085.121
##
## Node number 17: 943 observations
## mean=131.581, MSE=1436.057
##
## Node number 18: 931 observations
## mean=147.3217, MSE=2139.921
##
## Node number 19: 404 observations, complexity param=0.01235602
## mean=184.5926, MSE=2345.494
## left son=38 (331 obs) right son=39 (73 obs)
## Primary splits:
## cres < 82.83 to the left, improve=0.25911350, (0 missing)
## sea < 0.5 to the left, improve=0.12954860, (0 missing)
## msw_un < 520535 to the left, improve=0.09093708, (0 missing)
## sor < 50.385 to the right, improve=0.07881458, (0 missing)
## organic < 18.56773 to the right, improve=0.07301852, (0 missing)
## Surrogate splits:
## plastic < 1.673275 to the right, agree=0.832, adj=0.068, (0 split)
## sor < 45.985 to the right, agree=0.832, adj=0.068, (0 split)
## istat < 9020 to the right, agree=0.827, adj=0.041, (0 split)
## isle < 0.5 to the left, agree=0.827, adj=0.041, (0 split)
## msw_so < 32852 to the right, agree=0.827, adj=0.041, (0 split)
##
## Node number 24: 91 observations
## mean=211.2895, MSE=2662.646
##
## Node number 25: 50 observations
## mean=302.6418, MSE=10488.89
##
## Node number 26: 49 observations
## mean=338.7927, MSE=9464.67
##
## Node number 27: 9 observations
## mean=563.9411, MSE=52339.34
##
## Node number 38: 331 observations
## mean=173.0153, MSE=1494.51
##
## Node number 39: 73 observations
## mean=237.0873, MSE=2840.628
# Complexity-parameter table: cross-validated error (xerror) per cp, used to choose a pruning point
printcp(tree_model)
##
## Regression tree:
## rpart(formula = tc ~ ., data = train_data[, 4:(ncol(train_data) -
## 1)], method = "anova")
##
## Variables actually used in tree construction:
## [1] cres csor pop
##
## Root node error: 19871343/3472 = 5723.3
##
## n= 3472
##
## CP nsplit rel error xerror xstd
## 1 0.258335 0 1.00000 1.00090 0.076121
## 2 0.089529 1 0.74166 0.76201 0.049844
## 3 0.056489 2 0.65214 0.68362 0.047329
## 4 0.055848 3 0.59565 0.64693 0.044615
## 5 0.034975 4 0.53980 0.60011 0.043853
## 6 0.029862 5 0.50482 0.60145 0.046073
## 7 0.019695 6 0.47496 0.56269 0.044737
## 8 0.019396 7 0.45527 0.51740 0.037726
## 9 0.016174 8 0.43587 0.50567 0.037621
## 10 0.013552 9 0.41970 0.49796 0.037511
## 11 0.012356 10 0.40614 0.48609 0.036532
## 12 0.010000 11 0.39379 0.47703 0.035398
# Plot cross-validation error against the complexity parameter to guide pruning
plotcp(tree_model)
# Refit with a larger cp (0.022) to obtain a smaller, pruned tree
tree_model_2 <- rpart(tc ~., # Set tree formula
data = train_data[, 4:(ncol(train_data) - 1)], # Set data
control = rpart.control(cp = 0.022)) # Set parameters
fancyRpartPlot(tree_model_2) # Plot fancy tree
#install.packages("randomForest")
#install.packages("caret")
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
##
## xgboost
Best Random Forest
# Random forest regression on the same feature set as the linear models
rf_mod <- randomForest(tc ~., # Set tree formula
data = train_data[, 4:(ncol(train_data) - 1)], # Set dataset
ntree = 200, # Number of trees to grow
nodesize = 1, # Minimum size of terminal nodes (1 = fully grown trees)
mtry = 12) # Number of variables sampled as candidates at each split
# Score the random forest on the held-out test set
rf_preds <- predict(rf_mod, test_data) # Create predictions for random forest model
# Pair each prediction with the observed total cost
rf_plot_df <- data.frame(
  predicted = rf_preds,
  actual    = test_data$tc
)
# Actual vs. predicted scatter with a loess trend; the dashed 45-degree
# line marks perfect prediction
ggplot(rf_plot_df, aes(x = predicted, y = actual)) +
  geom_point() +
  geom_smooth() +
  xlim(-100, 850) +
  ylim(-100, 850) +
  geom_abline(slope = 1, linetype = 2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Interpretation
Comparing the random forest predictions (rf_preds) with the
backward-selection linear model predictions (lm_bwd_pred), the random
forest points lie closer to the actual values than those of the
lm_bwd_pred model.
library(fastDummies)
# One-hot encode the categorical columns; columns 1-4 (presumably the id
# columns and the target tc) and the final column are excluded from the
# feature set — TODO confirm against the waste_data column order
dummy_data <- dummy_cols(waste_data[, 5:(ncol(waste_data) - 1)], remove_selected_columns = TRUE)
# Split features with the same row indices used for the earlier train/test split
train_dummy <- dummy_data[train_data_indices,]
test_dummy <- dummy_data[-train_data_indices,]
# Create training matrix: features as a numeric matrix, tc as the label
dtrain <- xgb.DMatrix(data = as.matrix(train_dummy), label = as.numeric(train_data$tc))
# Create test matrix
dtest <- xgb.DMatrix(data = as.matrix(test_dummy), label = as.numeric(test_data$tc))
XGBoost
# Baseline XGBoost fit with default learning parameters (output below shows
# train-rmse, i.e. a squared-error regression objective)
set.seed(111111)
bst_1 <- xgboost(data = dtrain, # Set training data
nrounds = 100, # Set number of rounds
verbose = 1, # 1 - Prints out fit
print_every_n = 20 # Prints out result every 20th iteration
) # Remaining parameters left at their defaults
## [1] train-rmse:124.492289
## [21] train-rmse:17.851580
## [41] train-rmse:11.960609
## [61] train-rmse:8.847052
## [81] train-rmse:6.697660
## [100] train-rmse:5.242919
# Predict on the test matrix and evaluate against the observed costs
bst_preds <- predict(bst_1, dtest)
print(accuracy(bst_preds, test_data$tc))
## ME RMSE MAE MPE MAPE
## Test set 1.30633 41.37576 24.23526 -3.476268 15.34897
MAE has gone down
# Pair the XGBoost predictions with the observed total costs
xgb_plot_df <- data.frame(
  predicted = bst_preds,
  actual    = test_data$tc
)
# Actual vs. predicted scatter with a loess trend; the dashed 45-degree
# line marks perfect prediction
ggplot(xgb_plot_df, aes(x = predicted, y = actual)) +
  geom_point() +
  geom_smooth() +
  xlim(-100, 850) +
  ylim(-100, 850) +
  geom_abline(slope = 1, linetype = 2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Interpretation: an even better prediction
# Extract importance
imp_mat <- xgb.importance(model = bst_1)
# Plot importance (top 10 variables)
# NOTE(review): importance appears to be plotted by the default measure (Gain) — confirm
xgb.plot.importance(imp_mat, top_n = 10)
Interpretation cres and csor have the highest
importance scores, indicating that these features contribute the most to
the total cost predictions. These variables likely have strong
predictive power or influence over the target variable
# Load helper functions for SHAP score calculation and plotting
# NOTE(review): hard-coded user path — breaks on other machines; consider a
# project-relative path instead
source("~/Downloads/a_insights_shap_functions.r")
# Calculate SHAP importance
shap_result <- shap.score.rank(xgb_model = bst_1,                 # Fitted XGBoost model
                               X_train = as.matrix(train_dummy),  # Training feature matrix
                               shap_approx = FALSE)               # Exact SHAP values ("F" replaced by FALSE — T/F are reassignable)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:zoo':
##
## yearmon, yearqtr
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## make SHAP score by decreasing order
# Prepare long-format SHAP data restricted to the 10 highest-ranked features
shap_long = shap.prep(shap = shap_result,
X_train = as.matrix(train_dummy),
top_n = 10)
## Loading required package: ggforce
# Summary plot of per-observation SHAP values for the top features
plot.shap.summary(data_long = shap_long)
Interpretation: cres has the highest average SHAP value,
indicating it is the most influential feature in the model's
predictions; higher values of cres lead to higher predicted cost.
# Grid search over two tree-complexity parameters for XGBoost:
# max_depth bounds how deep each tree may grow; min_child_weight is the
# minimum summed instance weight required in a child node for a split.
max_depth_vals <- c(3, 5, 7, 10, 15) # Create vector of max depth values
min_child_weight <- c(1,3,5,7, 10, 15) # Create vector of min child values
# Expand grid of parameter values (all 30 combinations)
cv_params <- expand.grid(max_depth_vals, min_child_weight)
names(cv_params) <- c("max_depth", "min_child_weight")
# Pre-allocate a numeric results vector, one CV RMSE per combination
rmse_vec <- rep(NA_real_, nrow(cv_params))
# Cross-validate each parameter combination
for(i in seq_len(nrow(cv_params))){
  # Re-seed before every fit so the fold assignment is identical across
  # combinations and results are comparable
  set.seed(111111)
  bst_tune <- xgb.cv(data = dtrain, # Set training data
                     nfold = 5, # Use 5 fold cross-validation
                     eta = 0.1, # Set learning rate
                     max.depth = cv_params$max_depth[i], # Set max depth
                     min_child_weight = cv_params$min_child_weight[i], # Set minimum number of samples in node to split
                     nrounds = 400, # Set maximum number of boosting rounds
                     early_stopping_rounds = 20, # Stop if no improvement for 20 rounds
                     verbose = 1, # 1 - Prints out fit
                     nthread = 1, # Set number of parallel threads
                     print_every_n = 20) # Prints out result every 20th iteration
  # Record the mean test RMSE at the round chosen by early stopping.
  # NOTE(review): best_ntreelimit is used as the row index into the
  # evaluation log; best_iteration may be the safer index — confirm they
  # coincide for this configuration
  rmse_vec[i] <- bst_tune$evaluation_log$test_rmse_mean[bst_tune$best_ntreelimit]
}
## [1] train-rmse:155.923219+0.995006 test-rmse:156.025261+4.444763
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:45.093593+0.530261 test-rmse:50.106003+5.138494
## [41] train-rmse:35.235581+0.556793 test-rmse:43.258349+4.899167
## [61] train-rmse:32.371356+0.653130 test-rmse:41.639124+4.396476
## [81] train-rmse:30.484983+0.554332 test-rmse:40.996523+4.631632
## [101] train-rmse:28.926941+0.569101 test-rmse:40.495324+4.641312
## [121] train-rmse:27.633749+0.495846 test-rmse:40.267839+4.679667
## [141] train-rmse:26.471459+0.504642 test-rmse:40.001346+4.595750
## [161] train-rmse:25.388633+0.477942 test-rmse:39.800069+4.645083
## [181] train-rmse:24.459427+0.461160 test-rmse:39.546817+4.538143
## [201] train-rmse:23.575082+0.445557 test-rmse:39.338964+4.456426
## [221] train-rmse:22.874367+0.437115 test-rmse:39.236728+4.476173
## [241] train-rmse:22.140840+0.375995 test-rmse:39.118551+4.497049
## [261] train-rmse:21.433876+0.350244 test-rmse:38.973601+4.543437
## [281] train-rmse:20.872481+0.371997 test-rmse:38.913844+4.534199
## [301] train-rmse:20.314922+0.355746 test-rmse:38.880581+4.580636
## [321] train-rmse:19.767130+0.307719 test-rmse:38.820683+4.627091
## [341] train-rmse:19.237818+0.321120 test-rmse:38.736026+4.651512
## [361] train-rmse:18.721295+0.280269 test-rmse:38.654334+4.644152
## [381] train-rmse:18.243752+0.288682 test-rmse:38.591976+4.649406
## [400] train-rmse:17.863479+0.247630 test-rmse:38.618163+4.675198
## [1] train-rmse:155.635524+0.990844 test-rmse:155.884116+4.459781
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:37.306239+0.359437 test-rmse:47.834205+6.275392
## [41] train-rmse:25.628996+0.378135 test-rmse:41.570322+6.165153
## [61] train-rmse:21.848797+0.726270 test-rmse:40.344041+6.261118
## [81] train-rmse:19.547466+0.842682 test-rmse:39.870137+6.408156
## [101] train-rmse:17.610416+0.709637 test-rmse:39.640529+6.542050
## [121] train-rmse:16.077955+0.725726 test-rmse:39.437318+6.571280
## [141] train-rmse:14.808825+0.651605 test-rmse:39.294608+6.521086
## [161] train-rmse:13.628794+0.578398 test-rmse:39.172874+6.546034
## [181] train-rmse:12.582638+0.511526 test-rmse:39.105073+6.556987
## [201] train-rmse:11.711077+0.476856 test-rmse:39.092578+6.584332
## [221] train-rmse:10.865240+0.435333 test-rmse:39.029026+6.573390
## [241] train-rmse:10.102985+0.420194 test-rmse:39.005012+6.583835
## [261] train-rmse:9.417695+0.386754 test-rmse:38.986637+6.555736
## [281] train-rmse:8.759138+0.333634 test-rmse:38.990313+6.560016
## [301] train-rmse:8.236156+0.329632 test-rmse:38.965035+6.564122
## [321] train-rmse:7.678067+0.284506 test-rmse:38.941227+6.561929
## [341] train-rmse:7.174577+0.263691 test-rmse:38.913617+6.572939
## [361] train-rmse:6.674457+0.243478 test-rmse:38.888334+6.561878
## Stopping. Best iteration:
## [358] train-rmse:6.742673+0.243088 test-rmse:38.887928+6.561904
##
## [1] train-rmse:155.540935+0.988894 test-rmse:155.821506+4.448783
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:32.089768+0.334097 test-rmse:47.733685+6.892087
## [41] train-rmse:17.782253+0.390725 test-rmse:41.852103+6.991691
## [61] train-rmse:13.925996+0.451093 test-rmse:40.981263+6.764888
## [81] train-rmse:11.372034+0.352159 test-rmse:40.708358+6.692603
## [101] train-rmse:9.366618+0.311467 test-rmse:40.528106+6.652575
## [121] train-rmse:7.828062+0.282044 test-rmse:40.474379+6.633743
## [141] train-rmse:6.567936+0.169459 test-rmse:40.410489+6.623649
## [161] train-rmse:5.551714+0.175513 test-rmse:40.388019+6.624430
## [181] train-rmse:4.725067+0.182829 test-rmse:40.382295+6.618079
## Stopping. Best iteration:
## [167] train-rmse:5.289942+0.156780 test-rmse:40.373794+6.625056
##
## [1] train-rmse:155.513517+0.987498 test-rmse:155.866687+4.438155
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:27.751263+0.188949 test-rmse:47.846481+6.046332
## [41] train-rmse:8.803718+0.157587 test-rmse:41.338966+5.158538
## [61] train-rmse:5.274136+0.104530 test-rmse:40.788000+4.842424
## [81] train-rmse:3.563999+0.174723 test-rmse:40.652491+4.785510
## [101] train-rmse:2.401036+0.198387 test-rmse:40.591708+4.769054
## [121] train-rmse:1.685683+0.176099 test-rmse:40.578015+4.750616
## [141] train-rmse:1.144809+0.105795 test-rmse:40.577573+4.753814
## Stopping. Best iteration:
## [138] train-rmse:1.212141+0.111338 test-rmse:40.575250+4.752949
##
## [1] train-rmse:155.513134+0.987639 test-rmse:155.869106+4.436871
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:26.662322+0.162039 test-rmse:48.713198+6.497648
## [41] train-rmse:5.869685+0.088411 test-rmse:41.783150+5.638744
## [61] train-rmse:1.631045+0.067167 test-rmse:41.371784+5.434476
## [81] train-rmse:0.587690+0.035217 test-rmse:41.279417+5.357672
## [101] train-rmse:0.261959+0.013676 test-rmse:41.234058+5.315414
## [121] train-rmse:0.134977+0.015741 test-rmse:41.217986+5.303442
## [141] train-rmse:0.070989+0.008293 test-rmse:41.212585+5.299911
## [161] train-rmse:0.035268+0.004723 test-rmse:41.210424+5.298446
## [181] train-rmse:0.018618+0.002794 test-rmse:41.209721+5.297941
## [201] train-rmse:0.009102+0.001996 test-rmse:41.209539+5.297960
## [221] train-rmse:0.004379+0.001168 test-rmse:41.209473+5.298011
## [241] train-rmse:0.002085+0.000639 test-rmse:41.209428+5.297991
## [261] train-rmse:0.001098+0.000238 test-rmse:41.209408+5.297987
## [281] train-rmse:0.000901+0.000061 test-rmse:41.209402+5.297987
## Stopping. Best iteration:
## [274] train-rmse:0.000901+0.000061 test-rmse:41.209402+5.297987
##
## [1] train-rmse:155.923219+0.995006 test-rmse:156.025261+4.444763
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:45.261486+0.576828 test-rmse:50.279596+5.122574
## [41] train-rmse:35.505271+0.690523 test-rmse:43.618239+5.115975
## [61] train-rmse:32.511876+0.557619 test-rmse:42.128886+4.986490
## [81] train-rmse:30.439803+0.452820 test-rmse:41.370000+5.277682
## [101] train-rmse:28.903413+0.369530 test-rmse:40.932430+5.472931
## [121] train-rmse:27.527842+0.394657 test-rmse:40.674812+5.508553
## [141] train-rmse:26.340412+0.449682 test-rmse:40.480507+5.625782
## [161] train-rmse:25.316333+0.436525 test-rmse:40.250515+5.600797
## [181] train-rmse:24.359816+0.376062 test-rmse:40.044755+5.694525
## [201] train-rmse:23.536687+0.338762 test-rmse:39.836554+5.739841
## [221] train-rmse:22.728821+0.345782 test-rmse:39.700271+5.824222
## [241] train-rmse:22.034300+0.374043 test-rmse:39.615204+5.876605
## [261] train-rmse:21.335760+0.342067 test-rmse:39.523368+5.953505
## [281] train-rmse:20.759094+0.371213 test-rmse:39.462717+5.926652
## [301] train-rmse:20.169969+0.361070 test-rmse:39.432883+5.988091
## [321] train-rmse:19.644173+0.335691 test-rmse:39.308383+5.980166
## [341] train-rmse:19.084196+0.316821 test-rmse:39.207546+6.023201
## [361] train-rmse:18.613470+0.327446 test-rmse:39.186557+6.017137
## [381] train-rmse:18.160355+0.308557 test-rmse:39.152158+6.012429
## [400] train-rmse:17.742841+0.320447 test-rmse:39.093390+6.038325
## [1] train-rmse:155.663935+0.989397 test-rmse:155.961315+4.434324
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:37.518236+0.234517 test-rmse:47.686368+5.941638
## [41] train-rmse:25.897748+0.353815 test-rmse:41.491408+5.804376
## [61] train-rmse:22.196835+0.411861 test-rmse:40.165419+5.687282
## [81] train-rmse:19.733499+0.516305 test-rmse:39.769057+5.626148
## [101] train-rmse:17.941086+0.492227 test-rmse:39.505729+5.515807
## [121] train-rmse:16.376199+0.494484 test-rmse:39.308983+5.535572
## [141] train-rmse:14.959684+0.382131 test-rmse:39.104045+5.526289
## [161] train-rmse:13.729137+0.291206 test-rmse:38.988209+5.471128
## [181] train-rmse:12.600224+0.277000 test-rmse:38.930072+5.479241
## [201] train-rmse:11.633413+0.330056 test-rmse:38.891789+5.498065
## [221] train-rmse:10.838557+0.307510 test-rmse:38.866125+5.514231
## [241] train-rmse:10.092801+0.298413 test-rmse:38.851638+5.511194
## [261] train-rmse:9.402903+0.283144 test-rmse:38.812336+5.505448
## Stopping. Best iteration:
## [256] train-rmse:9.569201+0.309314 test-rmse:38.807201+5.493952
##
## [1] train-rmse:155.573313+0.991363 test-rmse:155.896028+4.448180
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:32.514414+0.368874 test-rmse:47.358893+6.308671
## [41] train-rmse:17.839060+0.453235 test-rmse:41.641008+5.936443
## [61] train-rmse:14.144815+0.432039 test-rmse:40.940547+5.804612
## [81] train-rmse:11.572142+0.299441 test-rmse:40.704365+5.735263
## [101] train-rmse:9.664845+0.087070 test-rmse:40.570333+5.796825
## [121] train-rmse:8.228081+0.154941 test-rmse:40.476992+5.795857
## [141] train-rmse:7.119070+0.112683 test-rmse:40.450593+5.813408
## [161] train-rmse:6.166064+0.103957 test-rmse:40.429551+5.839250
## [181] train-rmse:5.327917+0.114596 test-rmse:40.416814+5.839338
## [201] train-rmse:4.500785+0.076409 test-rmse:40.397115+5.854001
## [221] train-rmse:3.851500+0.085858 test-rmse:40.386394+5.870107
## [241] train-rmse:3.319453+0.113850 test-rmse:40.385897+5.862929
## Stopping. Best iteration:
## [229] train-rmse:3.647222+0.106560 test-rmse:40.378329+5.865510
##
## [1] train-rmse:155.549737+0.989554 test-rmse:155.936311+4.452184
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:28.332592+0.137265 test-rmse:47.199421+6.344402
## [41] train-rmse:9.796114+0.453455 test-rmse:41.442207+6.089064
## [61] train-rmse:6.568337+0.685001 test-rmse:41.037371+5.973341
## [81] train-rmse:5.076299+0.559379 test-rmse:40.953091+5.987056
## [101] train-rmse:3.900479+0.520891 test-rmse:40.925124+6.009822
## [121] train-rmse:2.977942+0.611258 test-rmse:40.894707+6.015241
## [141] train-rmse:2.347657+0.661378 test-rmse:40.888336+6.027346
## [161] train-rmse:1.753699+0.675078 test-rmse:40.881569+6.016826
## [181] train-rmse:1.346052+0.632964 test-rmse:40.870926+6.014248
## [201] train-rmse:1.043443+0.560873 test-rmse:40.858707+6.006834
## [221] train-rmse:0.831430+0.476842 test-rmse:40.864434+6.011661
## Stopping. Best iteration:
## [209] train-rmse:0.954477+0.532639 test-rmse:40.857884+6.004300
##
## [1] train-rmse:155.549373+0.989849 test-rmse:155.936358+4.452212
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:27.391438+0.175560 test-rmse:47.506820+6.602628
## [41] train-rmse:6.935040+0.245195 test-rmse:41.943775+6.172862
## [61] train-rmse:3.128819+0.492549 test-rmse:41.666054+6.005776
## [81] train-rmse:2.040610+0.478121 test-rmse:41.643736+5.980612
## [101] train-rmse:1.496994+0.419558 test-rmse:41.614197+5.956891
## [121] train-rmse:1.143136+0.359897 test-rmse:41.609170+5.961252
## [141] train-rmse:0.860449+0.286058 test-rmse:41.593978+5.950864
## [161] train-rmse:0.638717+0.191981 test-rmse:41.573003+5.940876
## [181] train-rmse:0.461626+0.138180 test-rmse:41.556177+5.934262
## [201] train-rmse:0.333635+0.101461 test-rmse:41.546124+5.937699
## [221] train-rmse:0.245112+0.075627 test-rmse:41.532338+5.934505
## [241] train-rmse:0.178459+0.057546 test-rmse:41.526864+5.934436
## [261] train-rmse:0.126095+0.040577 test-rmse:41.524725+5.936657
## [281] train-rmse:0.090937+0.032245 test-rmse:41.525127+5.940141
## Stopping. Best iteration:
## [263] train-rmse:0.121731+0.038786 test-rmse:41.523801+5.935661
##
## [1] train-rmse:155.923219+0.995006 test-rmse:156.025261+4.444763
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:45.668214+0.784018 test-rmse:50.225266+5.309379
## [41] train-rmse:36.151563+0.927362 test-rmse:43.454034+5.095706
## [61] train-rmse:33.201758+0.762540 test-rmse:41.816532+5.078728
## [81] train-rmse:31.020702+0.619584 test-rmse:41.141445+5.193476
## [101] train-rmse:29.458874+0.529671 test-rmse:40.745794+5.290664
## [121] train-rmse:28.194370+0.442708 test-rmse:40.415710+5.321824
## [141] train-rmse:27.046179+0.462530 test-rmse:40.247833+5.352164
## [161] train-rmse:25.956812+0.535022 test-rmse:40.090386+5.441841
## [181] train-rmse:24.969629+0.489796 test-rmse:39.838965+5.472727
## [201] train-rmse:24.190740+0.429843 test-rmse:39.725495+5.527306
## [221] train-rmse:23.385090+0.392902 test-rmse:39.631204+5.590598
## [241] train-rmse:22.641646+0.270584 test-rmse:39.526910+5.635503
## [261] train-rmse:21.973537+0.228644 test-rmse:39.381268+5.695521
## [281] train-rmse:21.386466+0.221521 test-rmse:39.291151+5.736197
## [301] train-rmse:20.778860+0.220350 test-rmse:39.280137+5.867370
## [321] train-rmse:20.229012+0.209733 test-rmse:39.199894+5.847261
## [341] train-rmse:19.707310+0.195022 test-rmse:39.195720+5.851529
## [361] train-rmse:19.194668+0.184907 test-rmse:39.136194+5.924410
## [381] train-rmse:18.707829+0.191766 test-rmse:39.140376+5.947670
## [400] train-rmse:18.265134+0.194705 test-rmse:39.095356+5.939024
## [1] train-rmse:155.669957+0.992305 test-rmse:155.865894+4.419190
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.156101+0.437160 test-rmse:47.236984+5.563532
## [41] train-rmse:26.675235+0.447150 test-rmse:40.709864+5.166045
## [61] train-rmse:23.194599+0.381905 test-rmse:39.677992+5.215275
## [81] train-rmse:20.757428+0.418509 test-rmse:39.128852+5.300222
## [101] train-rmse:18.857263+0.465870 test-rmse:38.795034+5.482592
## [121] train-rmse:17.142871+0.385091 test-rmse:38.575392+5.474923
## [141] train-rmse:15.648157+0.414026 test-rmse:38.411545+5.562134
## [161] train-rmse:14.409243+0.327026 test-rmse:38.267741+5.535385
## [181] train-rmse:13.325006+0.323475 test-rmse:38.228548+5.545185
## [201] train-rmse:12.372816+0.320893 test-rmse:38.129854+5.564059
## [221] train-rmse:11.476654+0.250907 test-rmse:38.091468+5.550118
## [241] train-rmse:10.691176+0.193056 test-rmse:38.039471+5.557781
## [261] train-rmse:9.967667+0.144746 test-rmse:38.031159+5.562501
## [281] train-rmse:9.231921+0.123225 test-rmse:37.979834+5.551849
## [301] train-rmse:8.627938+0.116817 test-rmse:37.975413+5.547636
## [321] train-rmse:8.048858+0.124121 test-rmse:37.961351+5.538837
## [341] train-rmse:7.562784+0.100356 test-rmse:37.938533+5.520537
## [361] train-rmse:7.060562+0.045791 test-rmse:37.929285+5.502289
## [381] train-rmse:6.623472+0.045477 test-rmse:37.910498+5.491523
## [400] train-rmse:6.232936+0.052304 test-rmse:37.934381+5.489826
## [1] train-rmse:155.579931+0.994539 test-rmse:155.788298+4.425868
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:33.472561+0.401728 test-rmse:46.688645+6.019535
## [41] train-rmse:19.937952+0.467959 test-rmse:40.740099+4.993142
## [61] train-rmse:16.920234+0.456917 test-rmse:39.880380+4.685706
## [81] train-rmse:14.864575+0.250844 test-rmse:39.558290+4.604239
## [101] train-rmse:12.712289+0.493108 test-rmse:39.423447+4.608295
## [121] train-rmse:11.023812+0.791937 test-rmse:39.360087+4.558788
## [141] train-rmse:9.567143+1.039922 test-rmse:39.318630+4.531322
## [161] train-rmse:8.365546+1.182578 test-rmse:39.271873+4.518436
## [181] train-rmse:7.265670+1.222136 test-rmse:39.241595+4.482621
## [201] train-rmse:6.341937+1.073613 test-rmse:39.230275+4.476901
## [221] train-rmse:5.561561+0.959691 test-rmse:39.219937+4.457194
## [241] train-rmse:4.762611+0.818945 test-rmse:39.202938+4.449654
## [261] train-rmse:4.091788+0.707099 test-rmse:39.195238+4.473358
## [281] train-rmse:3.560884+0.594243 test-rmse:39.193683+4.479715
## Stopping. Best iteration:
## [274] train-rmse:3.735931+0.650731 test-rmse:39.188524+4.474571
##
## [1] train-rmse:155.558472+0.991403 test-rmse:155.815540+4.427591
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:29.578846+0.389290 test-rmse:46.412746+5.813459
## [41] train-rmse:12.665625+0.554499 test-rmse:40.139243+4.813538
## [61] train-rmse:9.845133+0.616631 test-rmse:39.647708+4.593992
## [81] train-rmse:8.359111+0.691093 test-rmse:39.550131+4.558801
## [101] train-rmse:7.110555+0.746141 test-rmse:39.536738+4.561098
## [121] train-rmse:6.101495+0.674193 test-rmse:39.517281+4.620117
## [141] train-rmse:5.299083+0.607525 test-rmse:39.511690+4.642082
## [161] train-rmse:4.499875+0.558667 test-rmse:39.518928+4.656730
## Stopping. Best iteration:
## [149] train-rmse:4.975911+0.570371 test-rmse:39.507862+4.658671
##
## [1] train-rmse:155.558251+0.991538 test-rmse:155.815603+4.427637
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:28.786422+0.435636 test-rmse:46.482581+5.811310
## [41] train-rmse:10.001607+0.429646 test-rmse:40.123257+5.070857
## [61] train-rmse:6.587872+0.564184 test-rmse:39.708882+4.818365
## [81] train-rmse:5.136960+0.528998 test-rmse:39.585598+4.739122
## [101] train-rmse:4.076033+0.414160 test-rmse:39.595872+4.751275
## Stopping. Best iteration:
## [84] train-rmse:4.965934+0.536462 test-rmse:39.576262+4.745602
##
## [1] train-rmse:155.923219+0.995006 test-rmse:156.025261+4.444763
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:45.797761+0.761960 test-rmse:49.911694+5.231347
## [41] train-rmse:36.536388+0.892651 test-rmse:43.257002+5.136335
## [61] train-rmse:33.684870+0.873037 test-rmse:41.882848+5.017583
## [81] train-rmse:31.626561+0.639948 test-rmse:41.132896+5.048876
## [101] train-rmse:30.102464+0.550477 test-rmse:40.703755+5.032130
## [121] train-rmse:28.773654+0.513194 test-rmse:40.416854+5.208977
## [141] train-rmse:27.608952+0.564818 test-rmse:40.266063+5.289847
## [161] train-rmse:26.515930+0.602499 test-rmse:40.018540+5.331219
## [181] train-rmse:25.538550+0.568406 test-rmse:39.788404+5.294486
## [201] train-rmse:24.694532+0.614686 test-rmse:39.655583+5.279949
## [221] train-rmse:23.873289+0.565829 test-rmse:39.551840+5.276938
## [241] train-rmse:23.130079+0.502307 test-rmse:39.418876+5.290807
## [261] train-rmse:22.489764+0.425688 test-rmse:39.327120+5.242715
## [281] train-rmse:21.766488+0.417183 test-rmse:39.286420+5.300798
## [301] train-rmse:21.155909+0.355048 test-rmse:39.141280+5.325774
## [321] train-rmse:20.569280+0.358101 test-rmse:39.085246+5.404920
## [341] train-rmse:20.031969+0.340610 test-rmse:39.047630+5.402351
## [361] train-rmse:19.503017+0.306332 test-rmse:38.981530+5.416495
## [381] train-rmse:19.049457+0.298042 test-rmse:38.948432+5.450925
## [400] train-rmse:18.639680+0.270735 test-rmse:38.898228+5.451354
## [1] train-rmse:155.674421+0.996358 test-rmse:155.809973+4.477781
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.827184+0.629267 test-rmse:46.955373+5.258361
## [41] train-rmse:27.880962+0.761878 test-rmse:41.236741+4.684969
## [61] train-rmse:24.312502+0.757889 test-rmse:40.292237+4.651939
## [81] train-rmse:21.900595+0.676296 test-rmse:39.870986+4.697067
## [101] train-rmse:20.002443+0.516896 test-rmse:39.574537+4.751862
## [121] train-rmse:18.297759+0.469001 test-rmse:39.385012+4.779989
## [141] train-rmse:16.841058+0.452940 test-rmse:39.266976+4.855041
## [161] train-rmse:15.677410+0.381546 test-rmse:39.126837+4.844301
## [181] train-rmse:14.565879+0.361111 test-rmse:39.036801+4.842369
## [201] train-rmse:13.592803+0.352575 test-rmse:39.011461+4.862925
## Stopping. Best iteration:
## [190] train-rmse:14.106106+0.336381 test-rmse:38.977904+4.836467
##
## [1] train-rmse:155.583823+0.998974 test-rmse:155.749185+4.490540
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:34.477207+0.576108 test-rmse:46.183847+5.509669
## [41] train-rmse:21.693379+0.733534 test-rmse:40.386987+4.591050
## [61] train-rmse:19.019088+0.591381 test-rmse:39.819377+4.397270
## [81] train-rmse:16.951870+0.618128 test-rmse:39.525553+4.361783
## [101] train-rmse:15.181216+0.767454 test-rmse:39.350090+4.458203
## [121] train-rmse:13.627316+0.773530 test-rmse:39.228005+4.422149
## [141] train-rmse:11.951699+0.790139 test-rmse:39.167303+4.416682
## [161] train-rmse:10.656632+0.941635 test-rmse:39.085551+4.430813
## Stopping. Best iteration:
## [156] train-rmse:10.924023+0.950933 test-rmse:39.069447+4.415372
##
## [1] train-rmse:155.564952+0.994189 test-rmse:155.764182+4.489579
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:31.125694+0.690405 test-rmse:46.230963+5.412474
## [41] train-rmse:15.778394+0.983752 test-rmse:40.739257+4.570697
## [61] train-rmse:13.021589+0.960713 test-rmse:40.336485+4.373620
## [81] train-rmse:11.375103+0.936152 test-rmse:40.208827+4.315567
## [101] train-rmse:9.845662+0.932476 test-rmse:40.192703+4.223233
## [121] train-rmse:8.596861+0.879497 test-rmse:40.155396+4.164202
## [141] train-rmse:7.522202+0.848116 test-rmse:40.161833+4.118405
## Stopping. Best iteration:
## [125] train-rmse:8.348581+0.839370 test-rmse:40.140748+4.172937
##
## [1] train-rmse:155.564731+0.994324 test-rmse:155.764245+4.489624
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:30.294237+0.638722 test-rmse:46.183998+5.610968
## [41] train-rmse:13.006289+0.931179 test-rmse:40.434460+4.693843
## [61] train-rmse:9.633774+0.975980 test-rmse:40.093409+4.456081
## [81] train-rmse:7.725633+0.823563 test-rmse:40.051690+4.355386
## [101] train-rmse:6.285111+0.645862 test-rmse:40.047207+4.267359
## Stopping. Best iteration:
## [93] train-rmse:6.827789+0.706825 test-rmse:40.001798+4.318138
##
## [1] train-rmse:155.923219+0.995006 test-rmse:156.025261+4.444763
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:46.011139+0.818664 test-rmse:49.858349+5.224773
## [41] train-rmse:37.036852+0.856986 test-rmse:42.994131+4.815924
## [61] train-rmse:34.099045+0.753701 test-rmse:41.564434+4.846370
## [81] train-rmse:32.127938+0.628897 test-rmse:41.023944+4.936077
## [101] train-rmse:30.562584+0.518482 test-rmse:40.607200+4.981226
## [121] train-rmse:29.281677+0.459487 test-rmse:40.322825+5.095042
## [141] train-rmse:28.104133+0.477407 test-rmse:40.076025+5.121894
## [161] train-rmse:27.095146+0.519780 test-rmse:39.886026+5.137129
## [181] train-rmse:26.122661+0.539479 test-rmse:39.681715+5.269441
## [201] train-rmse:25.302088+0.461277 test-rmse:39.506076+5.360436
## [221] train-rmse:24.534303+0.497842 test-rmse:39.424825+5.455917
## [241] train-rmse:23.765040+0.494997 test-rmse:39.312331+5.471052
## [261] train-rmse:23.062301+0.486171 test-rmse:39.248629+5.533508
## [281] train-rmse:22.390743+0.447190 test-rmse:39.159901+5.593207
## [301] train-rmse:21.796373+0.444378 test-rmse:39.147849+5.593468
## Stopping. Best iteration:
## [289] train-rmse:22.158002+0.470632 test-rmse:39.121658+5.575967
##
## [1] train-rmse:155.683375+0.997403 test-rmse:155.808269+4.475182
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.635075+0.710631 test-rmse:46.588129+5.385014
## [41] train-rmse:28.944053+0.770717 test-rmse:40.644772+4.840380
## [61] train-rmse:25.444153+0.685129 test-rmse:39.840403+4.863084
## [81] train-rmse:23.125955+0.616136 test-rmse:39.430465+4.956810
## [101] train-rmse:21.428984+0.456083 test-rmse:39.217941+4.977984
## [121] train-rmse:19.770029+0.320270 test-rmse:39.061620+5.030205
## [141] train-rmse:18.446977+0.268956 test-rmse:38.880250+4.986119
## [161] train-rmse:17.173013+0.268071 test-rmse:38.781579+5.053973
## [181] train-rmse:16.026375+0.199319 test-rmse:38.698667+5.017995
## [201] train-rmse:15.061067+0.149424 test-rmse:38.643541+5.005422
## [221] train-rmse:14.185130+0.115322 test-rmse:38.624761+5.014609
## [241] train-rmse:13.328543+0.196200 test-rmse:38.613769+5.041800
## [261] train-rmse:12.551490+0.197290 test-rmse:38.566381+5.011979
## [281] train-rmse:11.757264+0.193628 test-rmse:38.545202+5.038519
## Stopping. Best iteration:
## [269] train-rmse:12.224207+0.214599 test-rmse:38.523666+5.030573
##
## [1] train-rmse:155.593576+0.999524 test-rmse:155.729175+4.482497
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:35.794634+0.761135 test-rmse:45.915867+5.097474
## [41] train-rmse:23.877213+0.931349 test-rmse:40.332650+4.588628
## [61] train-rmse:21.134434+0.875543 test-rmse:39.760550+4.340987
## [81] train-rmse:18.985846+0.889907 test-rmse:39.503112+4.297516
## [101] train-rmse:17.274876+0.846141 test-rmse:39.278254+4.298143
## [121] train-rmse:15.725528+0.527950 test-rmse:39.183504+4.324950
## [141] train-rmse:14.338271+0.409428 test-rmse:39.108741+4.320694
## [161] train-rmse:13.119541+0.505128 test-rmse:39.003558+4.281511
## [181] train-rmse:11.939009+0.679259 test-rmse:38.928333+4.271268
## [201] train-rmse:10.968982+0.891851 test-rmse:38.919110+4.279946
## [221] train-rmse:10.028509+1.052103 test-rmse:38.872795+4.275636
## [241] train-rmse:9.167648+1.184751 test-rmse:38.854964+4.232894
## [261] train-rmse:8.271626+1.240259 test-rmse:38.818084+4.218643
## [281] train-rmse:7.484287+1.278640 test-rmse:38.812453+4.197328
## Stopping. Best iteration:
## [266] train-rmse:8.066380+1.224079 test-rmse:38.800851+4.213550
##
## [1] train-rmse:155.575698+0.994179 test-rmse:155.743789+4.479039
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:32.917626+0.654092 test-rmse:45.382986+5.229377
## [41] train-rmse:18.935301+0.993680 test-rmse:39.609924+4.670923
## [61] train-rmse:16.016446+0.942589 test-rmse:39.217724+4.358806
## [81] train-rmse:14.261695+0.907734 test-rmse:39.165765+4.275564
## [101] train-rmse:12.749838+0.964577 test-rmse:39.064697+4.259235
## [121] train-rmse:11.454316+0.970915 test-rmse:39.032333+4.256497
## [141] train-rmse:10.377982+0.909442 test-rmse:38.986465+4.251055
## [161] train-rmse:9.379088+0.895431 test-rmse:38.962029+4.235906
## Stopping. Best iteration:
## [156] train-rmse:9.598302+0.910826 test-rmse:38.942701+4.250798
##
## [1] train-rmse:155.575477+0.994313 test-rmse:155.743852+4.479084
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:32.184717+0.555761 test-rmse:45.327270+5.300231
## [41] train-rmse:16.558327+0.930376 test-rmse:39.403841+4.823431
## [61] train-rmse:13.020733+0.827251 test-rmse:39.073049+4.581892
## Stopping. Best iteration:
## [58] train-rmse:13.408920+0.805291 test-rmse:39.049898+4.605043
##
## [1] train-rmse:155.928510+0.994099 test-rmse:156.012588+4.424331
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:46.394729+0.785737 test-rmse:50.010573+5.141373
## [41] train-rmse:37.477197+0.945086 test-rmse:43.292249+4.856590
## [61] train-rmse:34.697950+0.926048 test-rmse:41.819547+4.791283
## [81] train-rmse:32.824039+0.817514 test-rmse:41.095205+4.782919
## [101] train-rmse:31.405380+0.766240 test-rmse:40.706208+4.889656
## [121] train-rmse:30.151174+0.770466 test-rmse:40.505194+4.817776
## [141] train-rmse:29.031136+0.764227 test-rmse:40.209747+4.852588
## [161] train-rmse:28.033676+0.700861 test-rmse:39.980434+4.900808
## [181] train-rmse:27.047187+0.683060 test-rmse:39.805408+4.928373
## [201] train-rmse:26.251450+0.564662 test-rmse:39.677288+4.946854
## [221] train-rmse:25.482954+0.521290 test-rmse:39.528052+5.015162
## [241] train-rmse:24.788911+0.468435 test-rmse:39.454241+5.069034
## [261] train-rmse:24.076803+0.440437 test-rmse:39.385713+5.120202
## [281] train-rmse:23.431674+0.434120 test-rmse:39.303576+5.173130
## [301] train-rmse:22.851691+0.457692 test-rmse:39.247754+5.208020
## [321] train-rmse:22.310668+0.460469 test-rmse:39.209739+5.223890
## [341] train-rmse:21.799608+0.431278 test-rmse:39.144663+5.255303
## [361] train-rmse:21.304357+0.378111 test-rmse:39.171870+5.388337
## Stopping. Best iteration:
## [355] train-rmse:21.441731+0.398412 test-rmse:39.140591+5.372818
##
## [1] train-rmse:155.693463+0.994821 test-rmse:155.809231+4.455935
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:40.799635+1.036890 test-rmse:47.223533+5.156364
## [41] train-rmse:31.015806+0.948111 test-rmse:41.434288+4.388744
## [61] train-rmse:27.867882+0.804261 test-rmse:40.414331+4.274944
## [81] train-rmse:25.584146+0.895710 test-rmse:40.020300+4.312466
## [101] train-rmse:23.646398+0.774060 test-rmse:39.772485+4.376203
## [121] train-rmse:22.095201+0.788582 test-rmse:39.558129+4.372716
## [141] train-rmse:20.546204+0.795319 test-rmse:39.469596+4.403057
## [161] train-rmse:19.395617+0.713909 test-rmse:39.407533+4.436031
## [181] train-rmse:18.193837+0.636075 test-rmse:39.184372+4.419624
## [201] train-rmse:17.003789+0.514772 test-rmse:39.080524+4.463708
## [221] train-rmse:16.070674+0.437204 test-rmse:39.004393+4.541454
## [241] train-rmse:15.166980+0.458443 test-rmse:38.925051+4.564741
## [261] train-rmse:14.310488+0.364218 test-rmse:38.889798+4.530142
## [281] train-rmse:13.546584+0.359430 test-rmse:38.824347+4.498698
## [301] train-rmse:12.819510+0.302203 test-rmse:38.774241+4.506507
## [321] train-rmse:12.195240+0.250993 test-rmse:38.763024+4.535178
## [341] train-rmse:11.585869+0.238547 test-rmse:38.728615+4.543407
## [361] train-rmse:10.980963+0.215037 test-rmse:38.685963+4.583869
## [381] train-rmse:10.454953+0.200367 test-rmse:38.682394+4.577352
## Stopping. Best iteration:
## [376] train-rmse:10.581299+0.205868 test-rmse:38.672607+4.577582
##
## [1] train-rmse:155.605220+0.997586 test-rmse:155.734620+4.462215
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:37.398383+0.910310 test-rmse:46.030590+5.122129
## [41] train-rmse:26.451049+0.998985 test-rmse:40.119435+4.374975
## [61] train-rmse:23.655319+0.942740 test-rmse:39.606841+4.309743
## [81] train-rmse:21.649974+0.944753 test-rmse:39.486674+4.357037
## [101] train-rmse:19.941541+1.037673 test-rmse:39.312663+4.383887
## [121] train-rmse:18.446103+1.001197 test-rmse:39.190934+4.376996
## [141] train-rmse:17.046410+0.957040 test-rmse:39.159902+4.340655
## [161] train-rmse:15.823516+0.886960 test-rmse:39.160123+4.261785
## [181] train-rmse:14.640353+0.778228 test-rmse:39.098718+4.180354
## [201] train-rmse:13.517885+0.700988 test-rmse:39.117681+4.157102
## Stopping. Best iteration:
## [189] train-rmse:14.205256+0.852796 test-rmse:39.080676+4.144212
##
## [1] train-rmse:155.586684+0.993157 test-rmse:155.745685+4.457457
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:35.036128+0.954248 test-rmse:45.883272+5.086294
## [41] train-rmse:22.448311+1.287448 test-rmse:39.840053+4.398604
## [61] train-rmse:19.541090+1.340292 test-rmse:39.408062+4.206031
## [81] train-rmse:17.510792+1.302728 test-rmse:39.437521+4.101296
## Stopping. Best iteration:
## [63] train-rmse:19.307116+1.307286 test-rmse:39.389669+4.190030
##
## [1] train-rmse:155.586684+0.993157 test-rmse:155.745685+4.457457
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:34.443311+0.887727 test-rmse:45.755905+5.204352
## [41] train-rmse:20.369186+1.240301 test-rmse:39.664915+4.679408
## [61] train-rmse:16.754441+1.255015 test-rmse:39.297363+4.466601
## [81] train-rmse:14.457300+1.230454 test-rmse:39.257343+4.394194
## Stopping. Best iteration:
## [74] train-rmse:15.176265+1.255657 test-rmse:39.239298+4.377807
# Combine the hyperparameter grid with the cross-validated RMSE results
res_db <- cbind.data.frame(cv_params, rmse_vec)
names(res_db)[3] <- "rmse" # Name the result column (single name, no c() needed)
res_db$max_depth <- as.factor(res_db$max_depth) # Convert tree depth to factor for plotting
res_db$min_child_weight <- as.factor(res_db$min_child_weight) # Convert node size to factor for plotting
# Print RMSE heatmap (this is a regression model, so the metric is RMSE, not AUC)
g_2 <- ggplot(res_db, aes(y = max_depth, x = min_child_weight, fill = rmse)) + # Set aesthetics
  geom_tile() + # Use geom_tile for heatmap
  theme_bw() + # Set theme
  scale_fill_gradient2(low = "blue", # Choose low color
                       mid = "white", # Choose mid color
                       high = "red", # Choose high color
                       midpoint = mean(res_db$rmse), # Center the diverging scale at the mean RMSE
                       space = "Lab",
                       na.value = "grey", # Color used for missing values
                       guide = "colourbar", # Set color bar
                       aesthetics = "fill") + # Select aesthetics to apply
  labs(x = "Minimum Child Weight", y = "Max Depth", fill = "RMSE") # Set labels
g_2 # Generate plot
Interpretation: This graph is a hyperparameter tuning
heatmap for a machine learning model. It helps identify the best
combination of two key parameters—Maximum Depth and Minimum Child
Weight—for minimizing the RMSE (Root Mean Square Error), which measures
prediction error.
Performance Regions (Color-coded RMSE):
Blue/Purple Areas: Indicate lower RMSE values, meaning better model performance. Red Areas: Indicate higher RMSE values, meaning worse performance. The goal is to find the darkest blue region, as this represents the optimal hyperparameter combination.
Optimal Region: max_depth = 5 and min_child_weight = 5
The dark blue area in the heatmap highlights the best-performing combination of Max Depth and Minimum Child Weight, where RMSE is lowest. This region is the sweet spot for balancing underfitting and overfitting.
This graph helps us select the optimal parameter values by visually pinpointing the area with the lowest RMSE (dark blue).
res_db[which.min(res_db$rmse),]
## max_depth min_child_weight rmse
## 12 5 5 37.9105
gamma_vals <- c(0, 0.05, 0.1, 0.15, 0.2) # Candidate gamma (minimum loss reduction) values
# Be careful - this can take a very long time to run
rmse_vec <- rep(NA, length(gamma_vals)) # Preallocate storage for CV results
for (i in seq_along(gamma_vals)) { # seq_along() avoids the 1:length() empty-vector trap
  # Reset the seed inside the loop so every gamma value is evaluated on
  # identical CV folds (same convention as the subsample/colsample grid search)
  set.seed(111111)
  bst_tune <- xgb.cv(data = dtrain, # Set training data
                     nfold = 5, # Use 5-fold cross-validation
                     eta = 0.1, # Set learning rate
                     max.depth = 5, # Max depth (from earlier tuning)
                     min_child_weight = 5, # Minimum child weight (from earlier tuning)
                     gamma = gamma_vals[i], # Minimum loss reduction for a split
                     nrounds = 100, # Maximum number of boosting rounds
                     early_stopping_rounds = 20, # Stop if no improvement in 20 rounds
                     verbose = 1, # 1 - prints out fit
                     nthread = 1, # Number of parallel threads
                     print_every_n = 20 # Print result every 20th iteration
                     )
  # Record CV test RMSE at the best (early-stopped) iteration;
  # best_iteration is the documented field (best_ntreelimit is deprecated)
  rmse_vec[i] <- bst_tune$evaluation_log$test_rmse_mean[bst_tune$best_iteration]
}
## [1] train-rmse:155.669957+0.992305 test-rmse:155.865894+4.419190
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.156101+0.437160 test-rmse:47.236984+5.563532
## [41] train-rmse:26.675235+0.447150 test-rmse:40.709864+5.166045
## [61] train-rmse:23.194599+0.381905 test-rmse:39.677992+5.215275
## [81] train-rmse:20.757428+0.418509 test-rmse:39.128852+5.300222
## [100] train-rmse:18.921969+0.471757 test-rmse:38.798153+5.487136
## [1] train-rmse:155.624139+0.855612 test-rmse:155.946478+3.867198
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:37.873850+0.305590 test-rmse:48.366548+3.782038
## [41] train-rmse:26.462077+0.311419 test-rmse:42.623345+2.699476
## [61] train-rmse:22.999550+0.357038 test-rmse:42.000176+2.679805
## [81] train-rmse:20.721396+0.347934 test-rmse:41.755204+2.666869
## [100] train-rmse:18.928444+0.218683 test-rmse:41.536280+2.716259
## [1] train-rmse:155.642011+0.757829 test-rmse:155.795694+3.283537
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:37.954876+0.326507 test-rmse:48.271183+4.289049
## [41] train-rmse:26.480096+0.799600 test-rmse:42.900550+3.902017
## [61] train-rmse:22.938072+0.717120 test-rmse:42.290639+3.982604
## [81] train-rmse:20.452240+0.477754 test-rmse:41.903557+4.180795
## [100] train-rmse:18.774209+0.367113 test-rmse:41.789470+4.292675
## [1] train-rmse:155.635936+0.941595 test-rmse:155.836554+4.065533
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.077843+0.477146 test-rmse:47.794288+4.627716
## [41] train-rmse:26.883747+0.547061 test-rmse:42.321650+4.922131
## [61] train-rmse:23.237187+0.663856 test-rmse:41.752059+5.094904
## [81] train-rmse:20.774524+0.519789 test-rmse:41.451020+5.218075
## [100] train-rmse:19.097906+0.412005 test-rmse:41.381768+5.361733
## [1] train-rmse:155.649032+0.896118 test-rmse:155.825481+3.924286
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.032362+0.535894 test-rmse:49.286317+2.789323
## [41] train-rmse:26.416388+0.650058 test-rmse:43.439172+2.282346
## [61] train-rmse:22.782243+0.519330 test-rmse:42.562767+2.655483
## [81] train-rmse:20.432549+0.373585 test-rmse:42.167702+2.850239
## [100] train-rmse:18.791343+0.320954 test-rmse:41.961346+2.985648
# Let's view our results to identify the value of gamma to use:
# Gamma results
# Join each candidate gamma value to its cross-validated RMSE
cbind.data.frame(gamma_vals, rmse_vec)
## gamma_vals rmse_vec
## 1 0.00 38.79815
## 2 0.05 41.53384
## 3 0.10 41.78238
## 4 0.15 41.36386
## 5 0.20 41.96135
subsample <- c(0.6, 0.7, 0.8, 0.9, 1) # Candidate row-subsample proportions
colsample_by_tree <- c(0.6, 0.7, 0.8, 0.9, 1) # Candidate column-sample proportions
# Expand grid of tuning parameters (all 25 combinations)
cv_params <- expand.grid(subsample, colsample_by_tree)
names(cv_params) <- c("subsample", "colsample_by_tree")
# Preallocate vector to store results
rmse_vec <- rep(NA, nrow(cv_params))
# Loop through parameter combinations
for (i in seq_len(nrow(cv_params))) { # seq_len() is safe even for an empty grid
  set.seed(111111) # Same seed each iteration -> identical CV folds for every combination
  bst_tune <- xgb.cv(data = dtrain, # Set training data
                     nfold = 5, # Use 5-fold cross-validation
                     eta = 0.1, # Set learning rate
                     max.depth = 5, # Max depth (from earlier tuning)
                     min_child_weight = 5, # Minimum child weight (from earlier tuning)
                     gamma = 0, # Minimum loss reduction (from earlier tuning)
                     subsample = cv_params$subsample[i], # Proportion of rows used per tree
                     colsample_bytree = cv_params$colsample_by_tree[i], # Proportion of columns used per tree
                     nrounds = 150, # Maximum number of boosting rounds
                     early_stopping_rounds = 20, # Stop if no improvement in 20 rounds
                     verbose = 1, # 1 - prints out fit
                     nthread = 1, # Number of parallel threads
                     print_every_n = 20 # Print result every 20th iteration
                     )
  # Record CV test RMSE at the best (early-stopped) iteration;
  # best_iteration is the documented field (best_ntreelimit is deprecated)
  rmse_vec[i] <- bst_tune$evaluation_log$test_rmse_mean[bst_tune$best_iteration]
}
## [1] train-rmse:156.291580+1.134424 test-rmse:156.380547+4.490223
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:41.625678+1.188006 test-rmse:48.723618+5.732277
## [41] train-rmse:30.270035+1.095867 test-rmse:41.603933+5.040977
## [61] train-rmse:27.145651+0.948699 test-rmse:40.860320+5.189090
## [81] train-rmse:24.942090+1.047179 test-rmse:40.506981+5.328928
## [101] train-rmse:23.122362+1.036449 test-rmse:39.998408+5.231731
## [121] train-rmse:21.547796+0.992947 test-rmse:39.864006+5.410231
## [141] train-rmse:20.212245+0.947153 test-rmse:39.607059+5.270484
## [150] train-rmse:19.580229+0.916530 test-rmse:39.486594+5.215443
## [1] train-rmse:156.180297+1.180874 test-rmse:156.274783+4.270194
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:41.012587+1.402185 test-rmse:48.392872+5.573266
## [41] train-rmse:29.301820+1.389661 test-rmse:41.471292+4.935322
## [61] train-rmse:25.998738+1.070476 test-rmse:40.474000+4.978077
## [81] train-rmse:23.563668+0.989414 test-rmse:39.918954+5.126147
## [101] train-rmse:21.542258+0.860675 test-rmse:39.502628+5.085090
## [121] train-rmse:19.979315+0.930820 test-rmse:39.261521+5.160797
## [141] train-rmse:18.515159+0.954365 test-rmse:39.240929+5.233198
## [150] train-rmse:17.905418+0.878707 test-rmse:39.146248+5.217605
## [1] train-rmse:156.113181+1.114995 test-rmse:156.289350+4.380318
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:40.255881+0.998045 test-rmse:48.610496+5.957629
## [41] train-rmse:28.220078+0.691427 test-rmse:41.577854+5.603400
## [61] train-rmse:24.796127+0.746467 test-rmse:40.533496+5.588876
## [81] train-rmse:22.328533+0.553579 test-rmse:39.979970+5.690993
## [101] train-rmse:20.340265+0.631716 test-rmse:39.661534+5.763079
## [121] train-rmse:18.716275+0.593216 test-rmse:39.475244+5.796814
## [141] train-rmse:17.324511+0.551708 test-rmse:39.302907+5.797290
## [150] train-rmse:16.684074+0.573041 test-rmse:39.206503+5.758951
## [1] train-rmse:156.064116+1.120800 test-rmse:156.141441+4.395624
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.809305+0.801021 test-rmse:48.330223+5.981505
## [41] train-rmse:27.814969+0.621679 test-rmse:41.103771+5.335469
## [61] train-rmse:24.113012+0.407792 test-rmse:40.047294+5.202767
## [81] train-rmse:21.583854+0.397496 test-rmse:39.472346+5.133864
## [101] train-rmse:19.596170+0.287657 test-rmse:39.077661+5.149512
## [121] train-rmse:18.001721+0.375514 test-rmse:38.922964+5.132934
## [141] train-rmse:16.584435+0.262342 test-rmse:38.761365+5.144768
## [150] train-rmse:16.030579+0.264398 test-rmse:38.687578+5.177202
## [1] train-rmse:155.849898+0.907883 test-rmse:156.125241+4.589829
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.294708+0.481438 test-rmse:48.084461+6.134848
## [41] train-rmse:26.975843+0.568916 test-rmse:40.918388+5.612634
## [61] train-rmse:23.202392+0.651144 test-rmse:39.989423+5.456797
## [81] train-rmse:20.810667+0.668823 test-rmse:39.560855+5.443846
## [101] train-rmse:19.026590+0.524716 test-rmse:39.203812+5.517627
## [121] train-rmse:17.462076+0.460771 test-rmse:39.022355+5.510305
## [141] train-rmse:16.184580+0.391956 test-rmse:38.897916+5.518509
## [150] train-rmse:15.646761+0.361256 test-rmse:38.830695+5.509801
## [1] train-rmse:156.177440+1.112329 test-rmse:156.214763+4.419398
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:41.054854+1.035014 test-rmse:47.900869+6.040799
## [41] train-rmse:30.065247+1.029730 test-rmse:40.943314+5.606810
## [61] train-rmse:27.261584+0.895330 test-rmse:40.268873+5.609198
## [81] train-rmse:25.164523+0.950737 test-rmse:39.846401+5.531536
## [101] train-rmse:23.230802+0.973376 test-rmse:39.489493+5.396859
## [121] train-rmse:21.636450+1.037630 test-rmse:39.223737+5.469100
## [141] train-rmse:20.179279+1.087043 test-rmse:39.010269+5.606112
## [150] train-rmse:19.560585+1.097386 test-rmse:38.983560+5.518761
## [1] train-rmse:156.092013+1.191476 test-rmse:156.256243+4.235904
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:40.458992+0.856766 test-rmse:47.613359+5.793521
## [41] train-rmse:29.188384+1.057993 test-rmse:40.931213+5.118927
## [61] train-rmse:25.902044+0.783975 test-rmse:40.164865+5.208996
## [81] train-rmse:23.637899+0.746726 test-rmse:39.693487+5.112769
## [101] train-rmse:21.705293+0.827438 test-rmse:39.451026+5.223210
## [121] train-rmse:19.855097+0.878348 test-rmse:39.192646+5.330260
## [141] train-rmse:18.400175+0.812008 test-rmse:39.057665+5.298968
## [150] train-rmse:17.671689+0.778241 test-rmse:38.998893+5.362957
## [1] train-rmse:155.991359+1.127262 test-rmse:156.161536+4.473588
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.740164+0.559403 test-rmse:47.401289+5.868959
## [41] train-rmse:28.078753+0.581502 test-rmse:40.831795+5.029958
## [61] train-rmse:24.642379+0.728251 test-rmse:39.933057+4.987196
## [81] train-rmse:22.064594+0.652155 test-rmse:39.403365+5.130609
## [101] train-rmse:20.085370+0.495443 test-rmse:38.861509+4.989941
## [121] train-rmse:18.420895+0.494583 test-rmse:38.711803+4.985875
## [141] train-rmse:16.951364+0.395819 test-rmse:38.517433+5.056847
## [150] train-rmse:16.297066+0.366139 test-rmse:38.510253+5.092911
## [1] train-rmse:155.985737+1.145537 test-rmse:156.288793+4.357654
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.184214+0.469826 test-rmse:47.523167+5.813609
## [41] train-rmse:27.399645+0.511137 test-rmse:40.624388+4.960100
## [61] train-rmse:23.989185+0.448009 test-rmse:39.629873+4.862628
## [81] train-rmse:21.449382+0.361444 test-rmse:39.167148+4.944653
## [101] train-rmse:19.439899+0.293436 test-rmse:38.829178+4.943255
## [121] train-rmse:17.761871+0.343214 test-rmse:38.545187+4.963868
## [141] train-rmse:16.384352+0.291314 test-rmse:38.395175+4.935563
## [150] train-rmse:15.782046+0.315989 test-rmse:38.322918+4.934728
## [1] train-rmse:155.839298+0.898715 test-rmse:156.149274+4.656773
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.547628+0.508129 test-rmse:47.447583+5.922893
## [41] train-rmse:26.764550+0.710443 test-rmse:40.719245+5.115681
## [61] train-rmse:23.378775+0.770796 test-rmse:39.808524+4.980537
## [81] train-rmse:20.985002+0.756028 test-rmse:39.270668+4.985628
## [101] train-rmse:19.224153+0.813055 test-rmse:39.028684+5.054643
## [121] train-rmse:17.533188+0.665563 test-rmse:38.812500+5.106983
## [141] train-rmse:16.140011+0.633244 test-rmse:38.711704+5.129082
## [150] train-rmse:15.569447+0.565968 test-rmse:38.648896+5.136756
## [1] train-rmse:156.178377+1.113106 test-rmse:156.229209+4.422549
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:40.891827+0.901844 test-rmse:47.167286+5.151926
## [41] train-rmse:30.142583+0.996419 test-rmse:40.656223+4.759258
## [61] train-rmse:27.224142+0.900427 test-rmse:39.787144+4.770118
## [81] train-rmse:25.130520+0.963399 test-rmse:39.321836+4.593772
## [101] train-rmse:23.454367+0.947753 test-rmse:38.948115+4.633573
## [121] train-rmse:21.904743+0.925526 test-rmse:38.842512+4.845019
## [141] train-rmse:20.470661+0.865073 test-rmse:38.695299+4.904461
## [150] train-rmse:19.817300+0.884533 test-rmse:38.659919+4.822291
## [1] train-rmse:156.084983+1.219448 test-rmse:156.195661+4.311254
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:40.121248+0.825825 test-rmse:47.451767+5.410110
## [41] train-rmse:28.997394+0.921959 test-rmse:41.017012+4.925160
## [61] train-rmse:26.018578+0.654137 test-rmse:40.160959+4.850924
## [81] train-rmse:23.776847+0.798661 test-rmse:39.741252+4.830851
## [101] train-rmse:21.792616+0.681667 test-rmse:39.394424+4.926326
## [121] train-rmse:20.037946+0.694976 test-rmse:39.116832+5.036429
## [141] train-rmse:18.411468+0.632667 test-rmse:38.959986+5.159385
## [150] train-rmse:17.723150+0.594442 test-rmse:38.848474+5.277316
## [1] train-rmse:156.015442+1.174104 test-rmse:156.156272+4.425099
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.564629+0.528510 test-rmse:47.014312+6.059164
## [41] train-rmse:28.031450+0.525119 test-rmse:40.837430+5.165862
## [61] train-rmse:24.826333+0.625236 test-rmse:40.052544+5.071428
## [81] train-rmse:22.184593+0.639494 test-rmse:39.506227+5.124452
## [101] train-rmse:20.163674+0.609068 test-rmse:39.181205+5.056354
## [121] train-rmse:18.311667+0.599425 test-rmse:39.000186+5.140381
## [141] train-rmse:16.837760+0.493834 test-rmse:38.780511+5.196465
## [150] train-rmse:16.139689+0.498764 test-rmse:38.742693+5.218155
## [1] train-rmse:155.982942+1.143467 test-rmse:156.301868+4.333073
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.085432+0.596514 test-rmse:47.650925+5.543978
## [41] train-rmse:27.392445+0.440213 test-rmse:40.782148+4.576309
## [61] train-rmse:23.765727+0.551476 test-rmse:39.647850+4.516121
## [81] train-rmse:21.401628+0.400158 test-rmse:39.068174+4.456198
## [101] train-rmse:19.462099+0.412259 test-rmse:38.674099+4.456616
## [121] train-rmse:17.825618+0.313008 test-rmse:38.417274+4.527150
## [141] train-rmse:16.323121+0.289241 test-rmse:38.288630+4.547847
## [150] train-rmse:15.787460+0.324964 test-rmse:38.247348+4.530877
## [1] train-rmse:155.839187+0.898436 test-rmse:156.218077+4.704648
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.311036+0.450218 test-rmse:47.340778+5.850827
## [41] train-rmse:26.714710+0.468377 test-rmse:40.870520+5.156912
## [61] train-rmse:23.111174+0.647136 test-rmse:39.863700+5.036713
## [81] train-rmse:20.751335+0.461089 test-rmse:39.332642+4.962572
## [101] train-rmse:19.014339+0.467169 test-rmse:39.063752+4.995765
## [121] train-rmse:17.434822+0.413803 test-rmse:38.886996+4.943162
## [141] train-rmse:16.124520+0.334463 test-rmse:38.755606+4.935025
## [150] train-rmse:15.524068+0.272693 test-rmse:38.694992+4.917269
## [1] train-rmse:156.112079+1.131794 test-rmse:156.231783+4.428030
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:40.631075+0.979470 test-rmse:47.184586+5.025250
## [41] train-rmse:30.224822+1.194308 test-rmse:40.960456+4.533442
## [61] train-rmse:27.373110+0.947946 test-rmse:40.264137+4.555948
## [81] train-rmse:25.442543+0.957456 test-rmse:39.942953+4.446183
## [101] train-rmse:23.538515+0.927151 test-rmse:39.874912+4.375549
## [121] train-rmse:21.869966+1.077127 test-rmse:39.581849+4.362541
## [141] train-rmse:20.314956+1.009889 test-rmse:39.433813+4.321348
## [150] train-rmse:19.621685+0.964872 test-rmse:39.326409+4.321024
## [1] train-rmse:156.050601+1.214178 test-rmse:156.226310+4.438342
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.909838+0.913504 test-rmse:46.851218+4.986304
## [41] train-rmse:28.961209+1.008499 test-rmse:40.436367+4.440323
## [61] train-rmse:25.856636+0.940638 test-rmse:39.512138+4.439425
## [81] train-rmse:23.720859+0.981385 test-rmse:38.942781+4.457746
## [101] train-rmse:21.771561+0.950106 test-rmse:38.581884+4.461963
## [121] train-rmse:19.895905+0.852775 test-rmse:38.353050+4.547435
## [141] train-rmse:18.254287+0.798754 test-rmse:38.227143+4.555186
## [150] train-rmse:17.573858+0.804880 test-rmse:38.083244+4.543580
## [1] train-rmse:156.007363+1.175878 test-rmse:156.118958+4.476477
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.181850+0.485702 test-rmse:46.755726+5.444674
## [41] train-rmse:28.059356+0.709385 test-rmse:40.453957+4.701023
## [61] train-rmse:24.701509+0.745351 test-rmse:39.701412+4.779918
## [81] train-rmse:22.211652+0.881894 test-rmse:39.225590+4.790598
## [101] train-rmse:20.231272+0.886198 test-rmse:38.920089+4.825687
## [121] train-rmse:18.453885+0.767870 test-rmse:38.848609+4.881352
## [141] train-rmse:16.871992+0.656660 test-rmse:38.603432+4.941519
## [150] train-rmse:16.247109+0.679694 test-rmse:38.533410+4.963071
## [1] train-rmse:155.991614+1.124071 test-rmse:156.202968+4.372182
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.700786+0.431380 test-rmse:46.862356+5.359729
## [41] train-rmse:27.259100+0.686857 test-rmse:40.894935+4.801975
## [61] train-rmse:23.899220+0.753958 test-rmse:40.115942+4.944050
## [81] train-rmse:21.319265+0.612384 test-rmse:39.702955+5.077506
## [101] train-rmse:19.359289+0.577397 test-rmse:39.379893+5.167841
## [121] train-rmse:17.710481+0.550666 test-rmse:39.122219+5.160412
## [141] train-rmse:16.207076+0.517892 test-rmse:39.010788+5.162200
## [150] train-rmse:15.606470+0.454306 test-rmse:38.975855+5.187639
## [1] train-rmse:155.756163+0.955411 test-rmse:155.939527+4.474408
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.218010+0.607671 test-rmse:46.845340+5.411159
## [41] train-rmse:26.649823+0.476835 test-rmse:40.544062+4.546405
## [61] train-rmse:22.965013+0.470372 test-rmse:39.404912+4.596457
## [81] train-rmse:20.690756+0.480757 test-rmse:38.899079+4.532180
## [101] train-rmse:18.801046+0.280414 test-rmse:38.653309+4.571344
## [121] train-rmse:17.275738+0.188771 test-rmse:38.495048+4.578351
## [141] train-rmse:15.949040+0.168770 test-rmse:38.458078+4.598557
## [150] train-rmse:15.444665+0.153808 test-rmse:38.424424+4.622152
## [1] train-rmse:155.716033+1.041207 test-rmse:155.878949+4.240293
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.980271+0.899031 test-rmse:46.995177+4.864283
## [41] train-rmse:29.881118+0.830316 test-rmse:40.672883+4.211619
## [61] train-rmse:26.866022+0.816013 test-rmse:39.893607+4.319214
## [81] train-rmse:24.849374+0.758049 test-rmse:39.353926+4.250231
## [101] train-rmse:23.086081+0.786834 test-rmse:39.135953+4.301608
## [121] train-rmse:21.586225+0.810182 test-rmse:39.039988+4.171739
## [141] train-rmse:19.943191+0.835024 test-rmse:38.910651+4.135550
## [150] train-rmse:19.284659+0.807842 test-rmse:38.773888+4.138759
## [1] train-rmse:155.697139+1.044981 test-rmse:155.894196+4.293842
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.361181+0.618975 test-rmse:47.148422+4.887731
## [41] train-rmse:28.838144+0.951358 test-rmse:40.925137+3.766718
## [61] train-rmse:25.810585+0.809332 test-rmse:40.142945+3.624474
## [81] train-rmse:23.619256+0.848361 test-rmse:39.670041+3.679982
## [101] train-rmse:21.644541+0.893331 test-rmse:39.352504+3.763876
## [121] train-rmse:19.943548+0.910673 test-rmse:39.103856+3.683345
## [141] train-rmse:18.298481+0.858152 test-rmse:38.925187+3.697270
## [150] train-rmse:17.594802+0.846605 test-rmse:38.893378+3.760133
## [1] train-rmse:155.666892+0.993491 test-rmse:155.972399+4.366537
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.726839+0.755645 test-rmse:47.002576+4.969829
## [41] train-rmse:27.874082+0.944789 test-rmse:40.804194+3.990876
## [61] train-rmse:24.663644+0.886228 test-rmse:40.059195+4.004015
## [81] train-rmse:22.272936+0.958236 test-rmse:39.640069+4.101683
## [101] train-rmse:20.204768+0.882501 test-rmse:39.233083+4.253777
## [121] train-rmse:18.396895+0.864541 test-rmse:39.105024+4.244420
## [141] train-rmse:16.822424+0.756017 test-rmse:38.893167+4.292641
## [150] train-rmse:16.129316+0.713102 test-rmse:38.830747+4.262617
## [1] train-rmse:155.643642+0.991466 test-rmse:155.784404+4.409067
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.472527+0.543640 test-rmse:46.960939+5.401652
## [41] train-rmse:27.165821+0.655782 test-rmse:40.899146+4.883581
## [61] train-rmse:23.517065+0.761286 test-rmse:39.929740+4.944964
## [81] train-rmse:20.977950+0.772185 test-rmse:39.359333+5.086245
## [101] train-rmse:18.999433+0.561110 test-rmse:39.065539+5.207791
## [121] train-rmse:17.399469+0.539789 test-rmse:38.940123+5.339607
## [141] train-rmse:15.882419+0.519547 test-rmse:38.813482+5.340270
## [150] train-rmse:15.278418+0.426091 test-rmse:38.745651+5.310615
## [1] train-rmse:155.669957+0.992305 test-rmse:155.865894+4.419190
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:38.156101+0.437160 test-rmse:47.236984+5.563532
## [41] train-rmse:26.675235+0.447150 test-rmse:40.709864+5.166045
## [61] train-rmse:23.194599+0.381905 test-rmse:39.677992+5.215275
## [81] train-rmse:20.757428+0.418509 test-rmse:39.128852+5.300222
## [101] train-rmse:18.857263+0.465870 test-rmse:38.795034+5.482592
## [121] train-rmse:17.142871+0.385091 test-rmse:38.575392+5.474923
## [141] train-rmse:15.648157+0.414026 test-rmse:38.411545+5.562134
## [150] train-rmse:15.049870+0.365411 test-rmse:38.323501+5.579656
# Visualise tuning of the sampling parameters (subsample / colsample_bytree)
res_db <- cbind.data.frame(cv_params, rmse_vec)
names(res_db)[3] <- "rmse" # Scalar assignment; no c() needed
res_db$subsample <- as.factor(res_db$subsample) # Factor so the heatmap axis is discrete
res_db$colsample_by_tree <- as.factor(res_db$colsample_by_tree) # Factor so the heatmap axis is discrete
# Print RMSE heatmap over the subsample x colsample grid
g_4 <- ggplot(res_db, aes(y = colsample_by_tree, x = subsample, fill = rmse)) + # Set aesthetics
  geom_tile() + # Use geom_tile for heatmap
  theme_bw() + # Set theme
  scale_fill_gradient2(low = "blue", # Low RMSE = better performance
                       mid = "white", # Mid color
                       high = "red", # High RMSE = worse performance
                       midpoint = mean(res_db$rmse), # Center the diverging scale on the mean RMSE
                       space = "Lab",
                       na.value = "grey", # Color for missing values
                       guide = "colourbar", # Show a continuous color bar
                       aesthetics = "fill") + # Apply scale to fill aesthetic
  labs(x = "Subsample", y = "Column Sample by Tree", fill = "RMSE") # Set labels
g_4 # Generate plot
Optimal combination: subsample = 0.7 and column sample by tree = 0.9 (these values are carried forward into the eta-tuning models below).
###### 4 - eta tuning ######
# Use xgb.cv to run cross-validation inside xgboost
set.seed(111111) # Reproducible CV folds
# Candidate 1: aggressive learning rate (eta = 0.3)
bst_mod_1 <- xgb.cv(data = dtrain,              # Training data
                    nfold = 5,                  # 5-fold cross-validation
                    eta = 0.3,                  # Learning rate under test
                    max.depth = 5,              # From earlier tuning
                    min_child_weight = 5,       # From earlier tuning
                    gamma = 0,                  # From earlier tuning
                    subsample = 0.7,            # From earlier tuning
                    colsample_bytree = 0.9,     # From earlier tuning
                    nrounds = 1000,             # Generous cap; early stopping decides
                    early_stopping_rounds = 20, # Stop after 20 rounds with no improvement
                    verbose = 1,                # Print fitting progress
                    nthread = 1,                # Single thread
                    print_every_n = 20)         # Log every 20th round
## [1] train-rmse:126.221429+1.726339 test-rmse:126.967395+4.729086
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:27.281582+0.886040 test-rmse:43.327927+4.965869
## [41] train-rmse:21.582590+1.297152 test-rmse:42.803436+4.791390
## [61] train-rmse:17.213924+1.095050 test-rmse:42.769312+4.934886
## Stopping. Best iteration:
## [57] train-rmse:18.144053+1.170634 test-rmse:42.691267+4.772975
set.seed(111111) # Reproducible CV folds
# Candidate 2: moderate learning rate (eta = 0.1)
bst_mod_2 <- xgb.cv(data = dtrain,              # Training data
                    nfold = 5,                  # 5-fold cross-validation
                    eta = 0.1,                  # Learning rate under test
                    max.depth = 5,              # From earlier tuning
                    min_child_weight = 5,       # From earlier tuning
                    gamma = 0,                  # From earlier tuning
                    subsample = 0.7,            # From earlier tuning
                    colsample_bytree = 0.9,     # From earlier tuning
                    nrounds = 1000,             # Generous cap; early stopping decides
                    early_stopping_rounds = 20, # Stop after 20 rounds with no improvement
                    verbose = 1,                # Print fitting progress
                    nthread = 1,                # Single thread
                    print_every_n = 20)         # Log every 20th round
## [1] train-rmse:156.050601+1.214178 test-rmse:156.226310+4.438342
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:39.909838+0.913504 test-rmse:46.851218+4.986304
## [41] train-rmse:28.961209+1.008499 test-rmse:40.436367+4.440323
## [61] train-rmse:25.856636+0.940638 test-rmse:39.512138+4.439425
## [81] train-rmse:23.720859+0.981385 test-rmse:38.942781+4.457746
## [101] train-rmse:21.771561+0.950106 test-rmse:38.581884+4.461963
## [121] train-rmse:19.895905+0.852775 test-rmse:38.353050+4.547435
## [141] train-rmse:18.254287+0.798754 test-rmse:38.227143+4.555186
## [161] train-rmse:16.863762+0.765064 test-rmse:38.027892+4.590511
## [181] train-rmse:15.539537+0.755156 test-rmse:38.066572+4.606022
## Stopping. Best iteration:
## [162] train-rmse:16.776628+0.782759 test-rmse:38.026948+4.578883
set.seed(111111) # Reproducible CV folds
# Candidate 3: small learning rate (eta = 0.05)
bst_mod_3 <- xgb.cv(data = dtrain,              # Training data
                    nfold = 5,                  # 5-fold cross-validation
                    eta = 0.05,                 # Learning rate under test
                    max.depth = 5,              # From earlier tuning
                    min_child_weight = 5,       # From earlier tuning
                    gamma = 0,                  # From earlier tuning
                    subsample = 0.7,            # From earlier tuning
                    colsample_bytree = 0.9,     # From earlier tuning
                    nrounds = 1000,             # Generous cap; early stopping decides
                    early_stopping_rounds = 20, # Stop after 20 rounds with no improvement
                    verbose = 1,                # Print fitting progress
                    nthread = 1,                # Single thread
                    print_every_n = 20)         # Log every 20th round
## [1] train-rmse:163.645596+1.136107 test-rmse:163.705952+4.384028
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:70.946041+0.724285 test-rmse:73.543190+5.349142
## [41] train-rmse:40.944173+0.714031 test-rmse:47.714242+5.109761
## [61] train-rmse:32.036698+0.718619 test-rmse:41.870618+4.629976
## [81] train-rmse:28.750940+0.887176 test-rmse:40.332021+4.398836
## [101] train-rmse:26.989661+0.793043 test-rmse:39.685025+4.434232
## [121] train-rmse:25.746344+0.769723 test-rmse:39.417688+4.392644
## [141] train-rmse:24.574854+0.811988 test-rmse:39.228437+4.450826
## [161] train-rmse:23.496198+0.933040 test-rmse:39.068872+4.509305
## [181] train-rmse:22.508068+0.949237 test-rmse:38.852319+4.465521
## [201] train-rmse:21.652438+0.972101 test-rmse:38.766980+4.519673
## [221] train-rmse:20.799272+0.990975 test-rmse:38.647060+4.491595
## [241] train-rmse:19.956687+0.933110 test-rmse:38.505983+4.577817
## [261] train-rmse:19.160625+0.915361 test-rmse:38.402323+4.540959
## [281] train-rmse:18.377205+0.831631 test-rmse:38.341734+4.560897
## [301] train-rmse:17.658087+0.814335 test-rmse:38.276093+4.609224
## [321] train-rmse:16.948371+0.800184 test-rmse:38.212106+4.634375
## [341] train-rmse:16.261165+0.763589 test-rmse:38.142769+4.614524
## [361] train-rmse:15.584701+0.693656 test-rmse:38.084905+4.612309
## [381] train-rmse:14.949266+0.680382 test-rmse:38.048500+4.592930
## [401] train-rmse:14.365648+0.654907 test-rmse:38.054611+4.601365
## Stopping. Best iteration:
## [385] train-rmse:14.825310+0.681590 test-rmse:38.028778+4.596208
set.seed(111111) # Reproducible CV folds
# Candidate 4: very small learning rate (eta = 0.01)
bst_mod_4 <- xgb.cv(data = dtrain,              # Training data
                    nfold = 5,                  # 5-fold cross-validation
                    eta = 0.01,                 # Learning rate under test
                    max.depth = 5,              # From earlier tuning
                    min_child_weight = 5,       # From earlier tuning
                    gamma = 0,                  # From earlier tuning
                    subsample = 0.7,            # From earlier tuning
                    colsample_bytree = 0.9,     # From earlier tuning
                    nrounds = 1000,             # Generous cap; early stopping decides
                    early_stopping_rounds = 20, # Stop after 20 rounds with no improvement
                    verbose = 1,                # Print fitting progress
                    nthread = 1,                # Single thread
                    print_every_n = 20)         # Log every 20th round
## [1] train-rmse:169.751058+1.089100 test-rmse:169.725113+4.346152
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:141.880723+0.955250 test-rmse:142.257888+4.498548
## [41] train-rmse:119.167794+0.872740 test-rmse:120.037299+4.713734
## [61] train-rmse:100.802743+0.811676 test-rmse:102.244318+4.944918
## [81] train-rmse:85.894818+0.782803 test-rmse:87.992390+5.136235
## [101] train-rmse:73.884222+0.717277 test-rmse:76.688176+5.352523
## [121] train-rmse:64.235126+0.729433 test-rmse:67.782069+5.497611
## [141] train-rmse:56.555812+0.710863 test-rmse:60.908277+5.599919
## [161] train-rmse:50.471793+0.734623 test-rmse:55.655064+5.607419
## [181] train-rmse:45.661772+0.732638 test-rmse:51.709027+5.614634
## [201] train-rmse:41.887600+0.722793 test-rmse:48.730275+5.541621
## [221] train-rmse:38.932038+0.739646 test-rmse:46.558135+5.472146
## [241] train-rmse:36.590506+0.768168 test-rmse:44.884299+5.330835
## [261] train-rmse:34.744114+0.813147 test-rmse:43.700276+5.208201
## [281] train-rmse:33.284552+0.846326 test-rmse:42.825866+5.126308
## [301] train-rmse:32.135062+0.861412 test-rmse:42.131992+5.028858
## [321] train-rmse:31.221536+0.881145 test-rmse:41.607400+4.923748
## [341] train-rmse:30.439584+0.886590 test-rmse:41.192316+4.900387
## [361] train-rmse:29.804281+0.905998 test-rmse:40.909016+4.860042
## [381] train-rmse:29.241552+0.922932 test-rmse:40.680168+4.796410
## [401] train-rmse:28.721517+0.933025 test-rmse:40.468060+4.768594
## [421] train-rmse:28.283970+0.935860 test-rmse:40.272850+4.720182
## [441] train-rmse:27.885668+0.920283 test-rmse:40.127046+4.688938
## [461] train-rmse:27.533809+0.925783 test-rmse:39.995131+4.670988
## [481] train-rmse:27.215313+0.941093 test-rmse:39.903066+4.656156
## [501] train-rmse:26.885082+0.940984 test-rmse:39.795102+4.644757
## [521] train-rmse:26.586074+0.934100 test-rmse:39.693998+4.626631
## [541] train-rmse:26.307228+0.938589 test-rmse:39.609207+4.626136
## [561] train-rmse:26.025758+0.940028 test-rmse:39.529579+4.622719
## [581] train-rmse:25.746053+0.936777 test-rmse:39.442170+4.614326
## [601] train-rmse:25.502729+0.946358 test-rmse:39.381470+4.598905
## [621] train-rmse:25.248953+0.947486 test-rmse:39.312105+4.599686
## [641] train-rmse:24.998050+0.944837 test-rmse:39.245589+4.600753
## [661] train-rmse:24.778642+0.947764 test-rmse:39.191367+4.593372
## [681] train-rmse:24.539269+0.941668 test-rmse:39.117881+4.599828
## [701] train-rmse:24.321892+0.945048 test-rmse:39.066549+4.594213
## [721] train-rmse:24.098167+0.938395 test-rmse:39.012077+4.608622
## [741] train-rmse:23.887989+0.959588 test-rmse:38.985871+4.620076
## [761] train-rmse:23.694833+0.949804 test-rmse:38.947540+4.640001
## [781] train-rmse:23.474198+0.947959 test-rmse:38.884040+4.646983
## [801] train-rmse:23.272271+0.959769 test-rmse:38.829581+4.644860
## [821] train-rmse:23.078820+0.972193 test-rmse:38.800542+4.651635
## [841] train-rmse:22.878935+0.989104 test-rmse:38.773344+4.661049
## [861] train-rmse:22.680724+0.991005 test-rmse:38.739232+4.658809
## [881] train-rmse:22.490293+0.999787 test-rmse:38.709619+4.679306
## [901] train-rmse:22.306570+1.001152 test-rmse:38.676327+4.677275
## [921] train-rmse:22.123903+0.999008 test-rmse:38.645407+4.681627
## [941] train-rmse:21.943767+1.001066 test-rmse:38.615389+4.679911
## [961] train-rmse:21.761151+1.000074 test-rmse:38.567272+4.681637
## [981] train-rmse:21.582176+0.993373 test-rmse:38.554173+4.684477
## [1000] train-rmse:21.412550+0.996742 test-rmse:38.517828+4.694399
set.seed(111111)
# 5-fold cross-validation with the slowest learning rate tried (eta = 0.005)
bst_mod_5 <- xgb.cv(
  data = dtrain,              # Training data
  nfold = 5,                  # 5-fold cross-validation
  eta = 0.005,                # Learning rate
  max.depth = 5,              # Maximum tree depth
  min_child_weight = 5,       # Minimum sum of instance weight needed in a child
  gamma = 0,                  # Minimum loss reduction required to split
  subsample = 0.7,            # Row subsample ratio per tree
  colsample_bytree = 0.9,     # Column subsample ratio per tree
  nrounds = 1000,             # Maximum number of boosting rounds
  early_stopping_rounds = 20, # Stop if test RMSE stalls for 20 rounds
  verbose = 1,                # Print progress
  nthread = 1,                # Number of parallel threads
  print_every_n = 20          # Print every 20th iteration
)
## [1] train-rmse:170.515927+1.084205 test-rmse:170.479538+4.341769
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
##
## [21] train-rmse:155.827283+1.014768 test-rmse:155.990434+4.410453
## [41] train-rmse:142.537726+0.962567 test-rmse:142.924263+4.508570
## [61] train-rmse:130.591073+0.925791 test-rmse:131.199487+4.605702
## [81] train-rmse:119.789054+0.895809 test-rmse:120.669848+4.705063
## [101] train-rmse:110.056343+0.845651 test-rmse:111.196599+4.811344
## [121] train-rmse:101.275323+0.823743 test-rmse:102.666300+4.896093
## [141] train-rmse:93.393189+0.789487 test-rmse:95.099849+5.022794
## [161] train-rmse:86.321183+0.794908 test-rmse:88.338031+5.082084
## [181] train-rmse:79.951666+0.771745 test-rmse:82.326668+5.163812
## [201] train-rmse:74.251661+0.763005 test-rmse:76.992886+5.271831
## [221] train-rmse:69.158203+0.726369 test-rmse:72.264182+5.351938
## [241] train-rmse:64.579888+0.725927 test-rmse:68.076432+5.394121
## [261] train-rmse:60.479750+0.705619 test-rmse:64.392755+5.464637
## [281] train-rmse:56.845883+0.700854 test-rmse:61.167144+5.522503
## [301] train-rmse:53.602522+0.686929 test-rmse:58.307859+5.553678
## [321] train-rmse:50.719587+0.672038 test-rmse:55.839219+5.578217
## [341] train-rmse:48.167776+0.651343 test-rmse:53.715719+5.587653
## [361] train-rmse:45.891866+0.661654 test-rmse:51.861258+5.579469
## [381] train-rmse:43.881180+0.659681 test-rmse:50.250897+5.557247
## [401] train-rmse:42.082187+0.675744 test-rmse:48.839272+5.527592
## [421] train-rmse:40.496306+0.671839 test-rmse:47.657982+5.485309
## [441] train-rmse:39.104087+0.681247 test-rmse:46.638588+5.434602
## [461] train-rmse:37.844241+0.677333 test-rmse:45.744898+5.395461
## [481] train-rmse:36.740656+0.677980 test-rmse:44.970872+5.343011
## [501] train-rmse:35.750957+0.676105 test-rmse:44.295341+5.288149
## [521] train-rmse:34.891312+0.689270 test-rmse:43.743728+5.234937
## [541] train-rmse:34.103090+0.709530 test-rmse:43.233147+5.203919
## [561] train-rmse:33.406821+0.715545 test-rmse:42.798722+5.148838
## [581] train-rmse:32.795585+0.735327 test-rmse:42.430875+5.099515
## [601] train-rmse:32.237267+0.765588 test-rmse:42.128326+5.055134
## [621] train-rmse:31.736642+0.785292 test-rmse:41.855984+5.016137
## [641] train-rmse:31.290331+0.802890 test-rmse:41.615450+4.980566
## [661] train-rmse:30.876014+0.812018 test-rmse:41.391680+4.927557
## [681] train-rmse:30.510066+0.829420 test-rmse:41.211395+4.911014
## [701] train-rmse:30.151838+0.830699 test-rmse:41.016608+4.879565
## [721] train-rmse:29.831638+0.827219 test-rmse:40.865355+4.868538
## [741] train-rmse:29.549770+0.852200 test-rmse:40.744511+4.848055
## [761] train-rmse:29.270166+0.863540 test-rmse:40.617351+4.832612
## [781] train-rmse:29.002395+0.870984 test-rmse:40.489688+4.819068
## [801] train-rmse:28.770349+0.873786 test-rmse:40.379841+4.792393
## [821] train-rmse:28.543152+0.878570 test-rmse:40.283545+4.772872
## [841] train-rmse:28.333385+0.889542 test-rmse:40.201014+4.769705
## [861] train-rmse:28.132834+0.875737 test-rmse:40.118480+4.751033
## [881] train-rmse:27.951495+0.880850 test-rmse:40.042283+4.729199
## [901] train-rmse:27.766082+0.885420 test-rmse:39.968258+4.714211
## [921] train-rmse:27.585302+0.883028 test-rmse:39.895495+4.702939
## [941] train-rmse:27.413246+0.879127 test-rmse:39.836018+4.699596
## [961] train-rmse:27.239513+0.881716 test-rmse:39.779810+4.697129
## [981] train-rmse:27.078620+0.892377 test-rmse:39.737000+4.703278
## [1000] train-rmse:26.925898+0.881514 test-rmse:39.677833+4.703795
# eta plots
# Collect the test RMSE curve of each cross-validated model, tagged with the
# learning rate that produced it, into a single data frame for plotting.
eta_vals <- c(0.3, 0.1, 0.05, 0.01, 0.005)
cv_mods <- list(bst_mod_1, bst_mod_2, bst_mod_3, bst_mod_4, bst_mod_5)
pd_list <- vector("list", length(cv_mods))
for (i in seq_along(cv_mods)) {
  ev_log <- cv_mods[[i]]$evaluation_log
  pd_list[[i]] <- cbind.data.frame(ev_log[, c("iter", "test_rmse_mean")],
                                   eta = rep(eta_vals[i], nrow(ev_log)))
}
# Join datasets
plot_data <- do.call(rbind.data.frame, pd_list)
# Convert eta to a factor so ggplot uses a discrete colour scale
plot_data$eta <- as.factor(plot_data$eta)
# Plot points
# Scatter of mean test RMSE against boosting iteration, coloured by eta
g_6 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_point(alpha = 0.5) +  # Semi-transparent points to show overlap
  theme_bw() +               # Clean black-and-white theme
  theme(panel.grid.major = element_blank(),  # Strip grid lines,
        panel.grid.minor = element_blank(),  # border and background
        panel.border = element_blank(),
        panel.background = element_blank()) +
  labs(x = "Number of Trees",
       y = "RMSE",
       title = "RMSE v Number of Trees",
       color = "Learning \n Rate")  # Axis / legend labels
g_6
# Plot lines
# Smoothed test RMSE curves per learning rate (same data as g_6)
g_7 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_smooth(alpha = 0.5) +  # Smoothed trend line per eta value
  theme_bw() +                # Clean black-and-white theme
  theme(panel.grid.major = element_blank(),  # Strip grid lines,
        panel.grid.minor = element_blank(),  # border and background
        panel.border = element_blank(),
        panel.background = element_blank()) +
  labs(x = "Number of Trees",
       y = "RMSE",
       title = "RMSE v Number of Trees",
       color = "Learning \n Rate")  # Axis / legend labels
g_7
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Fit the final model on the full training set using the tuned parameters
set.seed(111111)
# NOTE: xgboost() without a validation watchlist can only monitor train-rmse,
# so the original early_stopping_rounds = 50 could never stop on held-out
# error (the log showed "Will train until train_rmse hasn't improved") and
# the fit ran all 243 rounds regardless. It is removed here; nrounds = 243
# comes from the best iteration found in the cross-validation runs above.
bst_mod_final <- xgboost(
  data = dtrain,          # Training data
  eta = 0.1,              # Learning rate chosen from the eta comparison
  max.depth = 5,          # Maximum tree depth
  min_child_weight = 5,   # Minimum sum of instance weight needed in a child
  gamma = 0,              # Minimum loss reduction required to split
  subsample = 0.7,        # Row subsample ratio per tree
  colsample_bytree = 0.9, # Column subsample ratio per tree
  nrounds = 243,          # Number of boosting rounds (from CV best iteration)
  verbose = 1,            # Print progress
  nthread = 1,            # Number of parallel threads
  print_every_n = 20      # Print every 20th iteration
)
## [1] train-rmse:155.776041
## Will train until train_rmse hasn't improved in 50 rounds.
##
## [21] train-rmse:39.900436
## [41] train-rmse:29.229977
## [61] train-rmse:26.590835
## [81] train-rmse:24.482098
## [101] train-rmse:22.712217
## [121] train-rmse:21.010145
## [141] train-rmse:19.606408
## [161] train-rmse:18.283559
## [181] train-rmse:17.108097
## [201] train-rmse:15.869291
## [221] train-rmse:14.740592
## [241] train-rmse:13.799385
## [243] train-rmse:13.688852
# Predict total cost on the held-out test set with the tuned model
bst_predsf <- predict(object = bst_mod_final, newdata = dtest)
# Accuracy metrics for the tuned model (forecast::accuracy)
print(accuracy(bst_predsf, test_data$tc))
## ME RMSE MAE MPE MAPE
## Test set 0.2126129 35.10928 21.65655 -3.955761 14.16861
The test-set RMSE has gone down, so the tuned parameters improved performance.
Produce the final model's predicted-versus-actual plot.
# Predicted vs actual total cost for the tuned model
plot_dat <- data.frame(predicted = bst_predsf, actual = test_data$tc)
ggplot(plot_dat, aes(x = predicted, y = actual)) +
  geom_point() +                        # One point per test observation
  geom_smooth() +                       # Smoothed trend of the relationship
  xlim(-100, 850) +                     # Matching axis limits so the
  ylim(-100, 850) +                     # reference line is at 45 degrees
  geom_abline(slope = 1, linetype = 2)  # Dashed y = x (perfect prediction)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Extract variable importance
# Extract variable importance from the final tuned model (bst_mod_final)
imp_mat <- xgb.importance(model = bst_mod_final)
# Plot importance (top 10 variables)
xgb.plot.importance(imp_mat, top_n = 10)
Interpretation
cres and csor have the highest importance scores, indicating that these features contribute the most to the total cost predictions. These variables likely have strong predictive power or influence over the target variable
## SHAP
# Calculate SHAP importance for the tuned model
# (shap_approx = FALSE requests exact SHAP values; the full literal FALSE is
# used rather than F, which is a reassignable variable in R)
shap_result <- shap.score.rank(xgb_model = bst_mod_final,
                               X_train = as.matrix(train_dummy),
                               shap_approx = FALSE)
## make SHAP score by decreasing order
SHAP summary plot for the tuned model
# Prepare long-format SHAP data for the top 10 features of the tuned model
# (assignment changed from `=` to the idiomatic `<-`)
shap_long <- shap.prep(shap = shap_result,
                       X_train = as.matrix(train_dummy),
                       top_n = 10)
# SHAP summary (beeswarm) plot
plot.shap.summary(data_long = shap_long)
# Rebuild the model matrices dropping the first two columns (cres and csor)
# so the model must rely on the remaining features.
dtrain <- xgb.DMatrix(data = as.matrix(train_dummy[, 3:ncol(train_dummy)]), label = as.numeric(train_data$tc))
# Create test matrix
# BUG FIX: the test matrix was indexed with ncol(train_dummy); index with
# ncol(test_dummy) so the column range always refers to the test data itself.
dtest <- xgb.DMatrix(data = as.matrix(test_dummy[, 3:ncol(test_dummy)]), label = as.numeric(test_data$tc))
XGBoost
set.seed(111111)
# Refit with default xgboost parameters on the reduced feature set
bst_1 <- xgboost(
  data = dtrain,      # Training data (cres / csor removed)
  nrounds = 100,      # Number of boosting rounds
  verbose = 1,        # Print progress
  print_every_n = 20  # Print every 20th iteration
)
## [1] train-rmse:126.003442
## [21] train-rmse:22.859882
## [41] train-rmse:15.842871
## [61] train-rmse:11.735085
## [81] train-rmse:8.815739
## [100] train-rmse:6.715222
# Predict on the test set with the reduced-feature model
bst_preds <- predict(object = bst_1, newdata = dtest)
# Accuracy metrics for the reduced-feature model (forecast::accuracy)
print(accuracy(bst_preds, test_data$tc))
## ME RMSE MAE MPE MAPE
## Test set 1.338493 50.34017 30.71847 -5.845864 20.25521
MAE has gone down. However, RMSE has increased, which might suggest underfitting.
# Predicted vs actual total cost for the reduced-feature model
plot_dat <- data.frame(predicted = bst_preds, actual = test_data$tc)
ggplot(plot_dat, aes(x = predicted, y = actual)) +
  geom_point() +                        # One point per test observation
  geom_smooth() +                       # Smoothed trend of the relationship
  xlim(-100, 850) +                     # Matching axis limits so the
  ylim(-100, 850) +                     # reference line is at 45 degrees
  geom_abline(slope = 1, linetype = 2)  # Dashed y = x (perfect prediction)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Interpretation: the predictions remain close to the actual values along the diagonal.
# Extract variable importance from the reduced-feature model (bst_1)
imp_mat <- xgb.importance(model = bst_1)
# Plot importance (top 10 variables)
xgb.plot.importance(imp_mat, top_n = 10)
Interpretation
After removing cres and csor, the feature that contributes the most to the total cost predictions is gdp.
# Load the helper functions used to compute and plot SHAP values
source("~/Downloads/a_insights_shap_functions.r")
# Calculate SHAP importance for the reduced-feature model
# (shap_approx = FALSE requests exact SHAP values; the full literal FALSE is
# used rather than F, which is a reassignable variable in R)
shap_result <- shap.score.rank(xgb_model = bst_1,
                               X_train = as.matrix(train_dummy[, 3:ncol(train_dummy)]),
                               shap_approx = FALSE)
## make SHAP score by decreasing order
# Prepare long-format SHAP data for the top 10 features of the reduced model
# (assignment changed from `=` to the idiomatic `<-`)
shap_long <- shap.prep(shap = shap_result,
                       X_train = as.matrix(train_dummy[, 3:ncol(train_dummy)]),
                       top_n = 10)
# SHAP summary (beeswarm) plot
plot.shap.summary(data_long = shap_long)